# Source code for skmultilearn.model_selection.measures

# -*- coding: utf-8 -*-
import numpy as np
import itertools as it


def example_distribution(folds, desired_size):
    """Examples Distribution (ED) measure

    Examples Distribution is a measure of how much a given fold's size deviates from the desired number
    of samples in each of the folds. It is the absolute difference between actual and desired fold
    size, averaged over the folds.

    Parameters
    ----------
    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold

    desired_size : List[int], shape = (n_folds)
        desired number of samples in each fold

    Returns
    -------
    example_distribution_score : float
        The example distribution score

    Raises
    ------
    ZeroDivisionError
        if :code:`folds` is empty
    """
    n_splits = float(len(folds))

    # The builtin ``sum`` is used because passing a generator to ``np.sum`` is deprecated;
    # NumPy merely fell back to the builtin in that case, so the result is unchanged.
    total_deviation = sum(
        np.abs(len(fold) - desired_fold_size) for fold, desired_fold_size in zip(folds, desired_size)
    )
    return total_deviation / n_splits


def get_indicator_representation(row):
    """Convert binary indicator to list of assigned labels

    Parameters
    ----------
    row : List[{0,1}] or array-like
        binary indicator list whether i-th label is assigned or not

    Returns
    -------
    np.array[int]
        indexes of the non-zero entries of :code:`row`, i.e. the assigned labels
    """
    # Comparing a plain list/tuple with ``!= 0`` yields a single bool instead of an
    # element-wise mask, so such inputs are converted to an array first.
    if isinstance(row, (list, tuple)):
        row = np.asarray(row)
    return np.where(row != 0)[0]


def get_combination_wise_output_matrix(y, order):
    """Returns label combinations of a given order that are assigned to each row

    For every row of :code:`y` the assigned labels are looked up and all combinations (with
    replacement) of :code:`order` of those labels are collected into a set.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of the output space

    order : int, >= 1
        the order of label relationship to take into account when balancing sample distribution across labels

    Returns
    -------
    combinations_per_row : np.array[Set[Tuple[int]]]
        array holding, for each row, the set of label combinations assigned to it
    """
    combination_sets = []
    for row in y:
        assigned_labels = get_indicator_representation(row)
        # combinations *with replacement*: a label paired with itself is included as well
        combination_sets.append(
            {tuple(labels) for labels in it.combinations_with_replacement(assigned_labels, order)}
        )
    return np.array(combination_sets)


def get_unique_combinations(combinations_per_row):
    """Performs set.union on a list of sets

    Parameters
    ----------
    combinations_per_row : List[Set[Tuple[int]]]
        list of combination assignments per row

    Returns
    -------
    Set[Tuple[int]]
        all unique label combinations; an empty set when :code:`combinations_per_row` is empty
    """
    # Starting from an empty set keeps the call valid when there are no rows at all
    # (``set.union()`` without arguments raises TypeError).
    return set().union(*combinations_per_row)


def folds_without_evidence_for_at_least_one_label_combination(y, folds, order=1):
    """Counts the number of folds without evidence for a given Label, Label Pair or Label Combination (FZ, FZLP, FZLC) measure

    A general implementation of FZ - the number of folds that contain at least one label combination of order
    :code:`order` with no positive examples. With :code:`order` = 1, it becomes the FZ measure from Katakis et.al's
    original paper.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of the output space

    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold

    order : int, >= 1
        the order of label relationship to take into account when balancing sample distribution across labels

    Returns
    -------
    score : int
        the number of folds with missing evidence for at least one label combination
    """
    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    # Index with ``fold`` directly: wrapping it in another list makes NumPy >= 1.23 build a
    # 2-D selection instead of picking the fold's rows.
    return np.sum([
        get_unique_combinations(combinations_per_row[fold]) != all_combinations
        for fold in folds
    ])


def folds_label_combination_pairs_without_evidence(y, folds, order):
    """Fold - Label / Label Pair / Label Combination (FLZ, FLPZ, FLCZ)  pair count measure

    A general implementation of FLZ - the number of pairs of fold and label combination of a given order for which
    there is no positive evidence in that fold for that combination. With :code:`order` = 1, it becomes the FLZ
    measure from Katakis et.al's original paper, with :code:`order` = 2, it becomes the FLPZ measure from
    Szymański et. al.'s paper.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of the output space

    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold

    order : int, >= 1
        the order of label relationship to take into account when balancing sample distribution across labels

    Returns
    -------
    score : int
        the number of fold-label combination pairs with missing evidence
    """
    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    # Index with ``fold`` directly: wrapping it in another list makes NumPy >= 1.23 build a
    # 2-D selection instead of picking the fold's rows.
    return np.sum([
        len(all_combinations.difference(get_unique_combinations(combinations_per_row[fold])))
        for fold in folds
    ])


def percentage_of_label_combinations_without_evidence_per_fold(y, folds, order):
    """Fraction of label / label pair / label combinations without evidence, reported per fold

    For every fold, computes which share of the label combinations of a given order that occur in the
    whole data set has no positive evidence among the samples of that fold. The value is a fraction
    between 0 and 1, not scaled to 100.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of the output space

    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold

    order : int, >= 1
        the order of label relationship to take into account when balancing sample distribution across labels

    Returns
    -------
    scores : List[float], shape = (n_folds)
        for each fold, the fraction of label combinations with missing evidence in that fold
    """
    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    number_of_combinations = float(len(all_combinations))
    # Index with ``fold`` directly: wrapping it in another list makes NumPy >= 1.23 build a
    # 2-D selection instead of picking the fold's rows.
    return [
        1.0 - len(get_unique_combinations(combinations_per_row[fold])) / number_of_combinations
        for fold in folds
    ]


def label_combination_distribution(y, folds, order):
    """Label / Label Pair / Label Combination Distribution (LD, LPD, LCD) measure

    For every label combination of the given order, the ratio of samples that carry the combination to
    samples that do not is computed for the whole data set and for each fold. The score is the absolute
    difference between the fold ratio and the data set ratio, averaged over all folds and all
    combinations.

    With :code:`order` = 1, it becomes the LD measure from Katakis et.al's original paper, with :code:`order` = 2, it
    becomes the LPD measure from Szymański et. al.'s paper.

    Parameters
    ----------
    y : output matrix or array of arrays (n_samples, n_labels)
        the binary-indicator label assignment per sample representation of the output space

    folds : List[List[int]], shape = (n_folds)
        list of indexes of samples assigned per fold

    order : int, >= 1
        the order of label relationship to take into account when balancing sample distribution across labels

    Returns
    -------
    score : float
        the label / label pair / label combination distribution score
    """

    def _positives_to_negatives(total, positives):
        # NOTE(review): the denominator is zero when every counted sample carries the
        # combination - confirm how callers expect that case to behave.
        return positives / float(total - positives)

    def _count_rows_with(rows, wanted):
        # number of rows whose combination set contains ``wanted``
        return np.sum([1 for row_combinations in rows if wanted in row_combinations])

    combinations_per_row = get_combination_wise_output_matrix(y, order)
    all_combinations = get_unique_combinations(combinations_per_row)
    number_of_samples = y.shape[0]
    combination_count = float(len(all_combinations))
    fold_count = float(len(folds))

    total_deviation = 0
    for combination in all_combinations:
        # positive/negative ratio of this combination in the entire data set
        dataset_ratio = _positives_to_negatives(
            number_of_samples, _count_rows_with(combinations_per_row, combination)
        )

        deviation_over_folds = 0
        for fold in folds:
            positives_in_fold = _count_rows_with(combinations_per_row[fold], combination)
            fold_ratio = _positives_to_negatives(len(fold), positives_in_fold)
            deviation_over_folds += np.abs(fold_ratio - dataset_ratio)

        # average over folds, then accumulate across combinations
        deviation_over_folds /= fold_count
        total_deviation += deviation_over_folds

    return total_deviation / combination_count