Source code for skmultilearn.model_selection.measures

# -*- coding: utf-8 -*-
import numpy as np
import itertools as it


def example_distribution(folds, desired_size):
    """Examples Distribution (ED) measure

    Examples Distribution is a measure of how much a given fold's size
    deviates from the desired number of samples in each of the folds.

    Parameters
    ----------
    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold
    desired_size : List[int], shape = (n_folds)
        desired number of samples in each fold

    Returns
    -------
    example_distribution_score : float
        The example distribution score: the sum of absolute differences
        between each fold's size and its desired size, divided by the
        number of folds
    """
    n_splits = float(len(folds))
    # Built-in sum is used on purpose: handing a generator to np.sum is
    # deprecated by NumPy. np.abs is kept so the result type is unchanged.
    total_deviation = sum(
        np.abs(len(fold) - desired_fold_size)
        for fold, desired_fold_size in zip(folds, desired_size)
    )
    return total_deviation / n_splits
def get_indicator_representation(row):
    """Convert binary indicator to list of assigned labels

    Parameters
    ----------
    row : List[{0,1}]
        binary indicator list whether i-th label is assigned or not

    Returns
    -------
    np.array[int]
        list of assigned labels (positions of the non-zero entries)
    """
    # Coerce to an array first: comparing a plain Python list with ``!= 0``
    # yields a single bool rather than an element-wise mask.
    indicator = np.asarray(row)
    return np.where(indicator != 0)[0]
def get_combination_wise_output_matrix(y, order):
    """Returns label combinations of a given order that are assigned to each row

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of
        the output space
    order : int, >= 1
        the order of label relationship to take into account when balancing
        sample distribution across labels

    Returns
    -------
    combinations_per_row : List[Set[Tuple[int]]]
        list of combination assignments per row
    """
    row_combinations = []
    for row in y:
        assigned_labels = get_indicator_representation(row)
        # Combinations are drawn with replacement, so a label is also paired
        # with itself (e.g. (0, 0) for order 2).
        label_tuples = it.combinations_with_replacement(assigned_labels, order)
        row_combinations.append({tuple(label_tuple) for label_tuple in label_tuples})
    return np.array(row_combinations)
def get_unique_combinations(combinations_per_row):
    """Performs set.union on a list of sets

    Parameters
    ----------
    combinations_per_row : List[Set[Tuple[int]]]
        list of combination assignments per row

    Returns
    -------
    Set[Tuple[int]]
        all unique label combinations; an empty set when no rows are given
    """
    # Starting from an empty set keeps the call valid for an empty input,
    # where the unbound ``set.union(*[])`` form raises TypeError.
    return set().union(*combinations_per_row)
def folds_without_evidence_for_at_least_one_label_combination(y, folds, order=1):
    """Counts the number of folds without evidence for a given Label, Label Pair
    or Label Combination (FZ, FZLP, FZLC) measure

    A general implementation of FZ - the number of folds that contain at
    least one label combination of order :code:`order` with no positive
    examples. With :code:`order` = 1, it becomes the FZ measure from Katakis
    et al.'s original paper.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of
        the output space
    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold
    order : int, >= 1
        the order of label relationship to take into account when balancing
        sample distribution across labels

    Returns
    -------
    score : int
        the number of folds with missing evidence for at least one label
        combination
    """
    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    # Select the fold's rows with the index list itself. Wrapping it in another
    # list (``[[fold]]``) is read by current NumPy as a 2-D index array, which
    # produces a (1, n) result instead of the fold's rows.
    return np.sum([
        get_unique_combinations(combinations_per_row[fold]) != all_combinations
        for fold in folds
    ])
def folds_label_combination_pairs_without_evidence(y, folds, order):
    """Fold - Label / Label Pair / Label Combination (FLZ, FLPZ, FLCZ) pair
    count measure

    A general implementation of FLZ - the number of pairs of fold and label
    combination of a given order for which there is no positive evidence in
    that fold for that combination. With :code:`order` = 1, it becomes the
    FLZ measure from Katakis et al.'s original paper, with :code:`order` = 2,
    it becomes the FLPZ measure from Szymański et al.'s paper.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of
        the output space
    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold
    order : int, >= 1
        the order of label relationship to take into account when balancing
        sample distribution across labels

    Returns
    -------
    score : int
        the number of fold-label combination pairs with missing evidence
    """
    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    # Select the fold's rows with the index list itself. Wrapping it in another
    # list (``[[fold]]``) is read by current NumPy as a 2-D index array, which
    # produces a (1, n) result instead of the fold's rows.
    return np.sum([
        len(all_combinations.difference(
            get_unique_combinations(combinations_per_row[fold])))
        for fold in folds
    ])
def percentage_of_label_combinations_without_evidence_per_fold(y, folds, order):
    """Fraction of label combinations without positive evidence, per fold

    For every fold, computes the share of all label combinations of the
    given order (as found in the whole of :code:`y`) that have no positive
    example among the samples of that fold. With :code:`order` = 1 the
    combinations are single labels, with :code:`order` = 2 label pairs.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of
        the output space
    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold
    order : int, >= 1
        the order of label relationship to take into account when balancing
        sample distribution across labels

    Returns
    -------
    scores : List[float], shape = (n_folds)
        for each fold, one minus the ratio of label combinations present in
        the fold to the number of label combinations present in :code:`y`
    """
    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    number_of_combinations = float(len(all_combinations))
    # Select the fold's rows with the index list itself. Wrapping it in another
    # list (``[[fold]]``) is read by current NumPy as a 2-D index array, which
    # produces a (1, n) result instead of the fold's rows.
    return [
        1.0 - len(get_unique_combinations(combinations_per_row[fold])) / number_of_combinations
        for fold in folds
    ]
def label_combination_distribution(y, folds, order):
    """Label / Label Pair / Label Combination Distribution measure

    Evaluates how the proportion of positive evidence to negative evidence
    for a label / label pair / label combination within each fold deviates
    from the same proportion in the entire data set, averaged over all
    folds and all label combinations. With :code:`order` = 1 this is the LD
    measure, with :code:`order` = 2 the LPD measure.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of
        the output space
    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold
    order : int, >= 1
        the order of label relationship to take into account when balancing
        sample distribution across labels

    Returns
    -------
    score : float
        the label / label pair / label combination distribution score
    """

    def _positive_to_negative_ratio(total, positives):
        # positives divided by the remaining (negative) count
        return positives / float(total - positives)

    def _count_rows_with(rows, combination):
        # number of rows whose combination set contains ``combination``
        return np.sum([1 for row_combinations in rows if combination in row_combinations])

    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    n_samples = y.shape[0]
    n_combinations = float(len(all_combinations))
    n_folds = float(len(folds))

    score = 0
    for combination in all_combinations:
        # proportion for this combination over the whole data set
        positives_overall = _count_rows_with(combinations_per_row, combination)
        dataset_ratio = _positive_to_negative_ratio(n_samples, positives_overall)

        deviation = 0
        for fold in folds:
            positives_in_fold = _count_rows_with(combinations_per_row[fold], combination)
            fold_ratio = _positive_to_negative_ratio(len(fold), positives_in_fold)
            deviation += np.abs(fold_ratio - dataset_ratio)

        # average over folds, then accumulate over combinations
        score += deviation / n_folds

    return score / n_combinations