Source code for mapie.metrics

from typing import Optional, cast, Union

import numpy as np
from sklearn.utils.validation import check_array, column_or_1d

from ._typing import ArrayLike, NDArray
from .utils import (calc_bins,
                    check_array_shape_classification,
                    check_array_shape_regression,
                    check_binary_zero_one,
                    check_nb_intervals_sizes,
                    check_nb_sets_sizes,
                    check_number_bins,
                    check_split_strategy)


[docs]def regression_coverage_score( y_true: ArrayLike, y_pred_low: ArrayLike, y_pred_up: ArrayLike, ) -> float: """ Effective coverage score obtained by the prediction intervals. The effective coverage is obtained by estimating the fraction of true labels that lie within the prediction intervals. Parameters ---------- y_true: ArrayLike of shape (n_samples,) True labels. y_pred_low: ArrayLike of shape (n_samples,) Lower bound of prediction intervals. y_pred_up: ArrayLike of shape (n_samples,) Upper bound of prediction intervals. Returns ------- float Effective coverage obtained by the prediction intervals. Examples -------- >>> from mapie.metrics import regression_coverage_score >>> import numpy as np >>> y_true = np.array([5, 7.5, 9.5, 10.5, 12.5]) >>> y_pred_low = np.array([4, 6, 9, 8.5, 10.5]) >>> y_pred_up = np.array([6, 9, 10, 12.5, 12]) >>> print(regression_coverage_score(y_true, y_pred_low, y_pred_up)) 0.8 """ y_true = cast(NDArray, column_or_1d(y_true)) y_pred_low = cast(NDArray, column_or_1d(y_pred_low)) y_pred_up = cast(NDArray, column_or_1d(y_pred_up)) coverage = np.mean( ((y_pred_low <= y_true) & (y_pred_up >= y_true)) ) return float(coverage)
[docs]def classification_coverage_score( y_true: ArrayLike, y_pred_set: ArrayLike ) -> float: """ Effective coverage score obtained by the prediction sets. The effective coverage is obtained by estimating the fraction of true labels that lie within the prediction sets. Parameters ---------- y_true: ArrayLike of shape (n_samples,) True labels. y_pred_set: ArrayLike of shape (n_samples, n_class) Prediction sets given by booleans of labels. Returns ------- float Effective coverage obtained by the prediction sets. Examples -------- >>> from mapie.metrics import classification_coverage_score >>> import numpy as np >>> y_true = np.array([3, 3, 1, 2, 2]) >>> y_pred_set = np.array([ ... [False, False, True, True], ... [False, True, False, True], ... [False, True, True, False], ... [False, False, True, True], ... [False, True, False, True] ... ]) >>> print(classification_coverage_score(y_true, y_pred_set)) 0.8 """ y_true = cast(NDArray, column_or_1d(y_true)) y_pred_set = cast( NDArray, check_array( y_pred_set, force_all_finite=True, dtype=["bool"] ) ) coverage = np.take_along_axis( y_pred_set, y_true.reshape(-1, 1), axis=1 ).mean() return float(coverage)
[docs]def regression_mean_width_score( y_pred_low: ArrayLike, y_pred_up: ArrayLike ) -> float: """ Effective mean width score obtained by the prediction intervals. Parameters ---------- y_pred_low: ArrayLike of shape (n_samples,) Lower bound of prediction intervals. y_pred_up: ArrayLike of shape (n_samples,) Upper bound of prediction intervals. Returns ------- float Effective mean width of the prediction intervals. Examples -------- >>> from mapie.metrics import regression_mean_width_score >>> import numpy as np >>> y_pred_low = np.array([4, 6, 9, 8.5, 10.5]) >>> y_pred_up = np.array([6, 9, 10, 12.5, 12]) >>> print(regression_mean_width_score(y_pred_low, y_pred_up)) 2.3 """ y_pred_low = cast(NDArray, column_or_1d(y_pred_low)) y_pred_up = cast(NDArray, column_or_1d(y_pred_up)) mean_width = np.abs(y_pred_up - y_pred_low).mean() return float(mean_width)
[docs]def classification_mean_width_score(y_pred_set: ArrayLike) -> float: """ Mean width of prediction set output by :class:`~mapie.classification.MapieClassifier`. Parameters ---------- y_pred_set: ArrayLike of shape (n_samples, n_class) Prediction sets given by booleans of labels. Returns ------- float Mean width of the prediction set. Examples -------- >>> from mapie.metrics import classification_mean_width_score >>> import numpy as np >>> y_pred_set = np.array([ ... [False, False, True, True], ... [False, True, False, True], ... [False, True, True, False], ... [False, False, True, True], ... [False, True, False, True] ... ]) >>> print(classification_mean_width_score(y_pred_set)) 2.0 """ y_pred_set = cast( NDArray, check_array( y_pred_set, force_all_finite=True, dtype=["bool"] ) ) mean_width = y_pred_set.sum(axis=1).mean() return float(mean_width)
[docs]def expected_calibration_error( y_true: ArrayLike, y_scores: ArrayLike, num_bins: int = 50, split_strategy: Optional[str] = None, ) -> float: """ The expected calibration error, which is the difference between the confidence scores and accuracy per bin [1]. [1] Naeini, Mahdi Pakdaman, Gregory Cooper, and Milos Hauskrecht. "Obtaining well calibrated probabilities using bayesian binning." Twenty-Ninth AAAI Conference on Artificial Intelligence. 2015. Parameters ---------- y_true: ArrayLike of shape (n_samples,) The target values for the calibrator. y_score: ArrayLike of shape (n_samples,) or (n_samples, n_classes) The predictions scores. num_bins: int Number of bins to make the split in the y_score. The allowed values are num_bins above 0. split_strategy: str The way of splitting the predictions into different bins. The allowed split strategies are "uniform", "quantile" and "array split". Returns ------- float The score of ECE (Expected Calibration Error). """ split_strategy = check_split_strategy(split_strategy) num_bins = check_number_bins(num_bins) y_true_ = check_binary_zero_one(y_true) y_scores = cast(NDArray, y_scores) if np.size(y_scores.shape) == 2: y_score = cast( NDArray, column_or_1d(np.nanmax(y_scores, axis=1)) ) else: y_score = cast(NDArray, column_or_1d(y_scores)) _, bin_accs, bin_confs, bin_sizes = calc_bins( y_true_, y_score, num_bins, split_strategy ) return np.divide( np.sum(bin_sizes * np.abs(bin_accs - bin_confs)), np.sum(bin_sizes) )
[docs]def top_label_ece( y_true: ArrayLike, y_scores: ArrayLike, y_score_arg: Optional[ArrayLike] = None, num_bins: int = 50, split_strategy: Optional[str] = None, classes: Optional[ArrayLike] = None, ) -> float: """ The Top-Label ECE which is a method adapted to fit the ECE to a Top-Label setting [2]. [2] Gupta, Chirag, and Aaditya K. Ramdas. "Top-label calibration and multiclass-to-binary reductions." arXiv preprint arXiv:2107.08353 (2021). Parameters ---------- y_true: ArrayLike of shape (n_samples,) The target values for the calibrator. y_scores: ArrayLike of shape (n_samples, n_classes) or (n_samples,) The predictions scores, either the maximum score and the argmax needs to be inputted or in the form of the prediction probabilities. y_score_arg: Optional[ArrayLike] of shape (n_samples,) If only the maximum is provided in the y_scores, the argmax must be provided here. This is optional and could be directly infered from the y_scores. num_bins: int Number of bins to make the split in the y_score. The allowed values are num_bins above 0. split_strategy: str The way of splitting the predictions into different bins. The allowed split strategies are "uniform", "quantile" and "array split". classes: ArrayLike of shape (n_samples,) The different classes, in order of the indices that would be present in a pred_proba. Returns ------- float The ECE score adapted in the top label setting. """ ece = float(0.) split_strategy = check_split_strategy(split_strategy) num_bins = check_number_bins(num_bins) y_true = cast(NDArray, column_or_1d(y_true)) if y_score_arg is None: y_score = cast( NDArray, column_or_1d(np.nanmax(y_scores, axis=1)) ) if classes is None: y_score_arg = cast( NDArray, column_or_1d(np.nanargmax(y_scores, axis=1)) ) else: classes = cast(NDArray, classes) y_score_arg = cast( NDArray, column_or_1d(classes[np.nanargmax(y_scores, axis=1)]) ) else: y_score = cast(NDArray, column_or_1d(y_scores)) y_score_arg = cast(NDArray, column_or_1d(y_score_arg)) labels = np.unique(y_score_arg) for label in labels: label_ind = np.where(label == y_score_arg)[0] y_true_ = np.array(y_true[label_ind] == label, dtype=int) ece += expected_calibration_error( y_true_, y_scores=y_score[label_ind], num_bins=num_bins, split_strategy=split_strategy ) ece /= len(labels) return ece
[docs]def regression_coverage_score_v2( y_true: NDArray, y_intervals: NDArray, ) -> NDArray: """ Effective coverage score obtained by the prediction intervals. The effective coverage is obtained by estimating the fraction of true labels that lie within the prediction intervals. It is different from ``regression_coverage_score`` because it uses directly the output of ``predict`` method and can compute the coverage for each alpha. Parameters ---------- y_true: NDArray of shape (n_samples, n_alpha) or (n_samples,) True labels. y_intervals: NDArray of shape (n_samples, 2, n_alpha) Lower and upper bound of prediction intervals with different alpha risks. Returns ------- NDArray of shape (n_alpha,) Effective coverage obtained by the prediction intervals. """ y_intervals = check_array_shape_regression(y_true, y_intervals) if len(y_true.shape) != 2: y_true = cast(NDArray, column_or_1d(y_true)) y_true = np.expand_dims(y_true, axis=1) coverages = np.mean( np.logical_and( np.less_equal(y_intervals[:, 0, :], y_true), np.greater_equal(y_intervals[:, 1, :], y_true) ), axis=0 ) return coverages
[docs]def classification_coverage_score_v2( y_true: NDArray, y_pred_set: NDArray ) -> NDArray: """ Effective coverage score obtained by the prediction sets. The effective coverage is obtained by estimating the fraction of true labels that lie within the prediction sets. It is different from ``classification_coverage_score`` because it uses directly the output of ``predict`` method and can compute the coverage for each alpha. Parameters ---------- y_true: NDArray of shape (n_samples, n_alpha) or (n_samples,) True labels. y_pred_set: NDArray of shape (n_samples, n_class, n_alpha) Prediction sets given by booleans of labels. Returns ------- NDArray of shape (n_alpha,) Effective coverage obtained by the prediction sets. """ y_pred_set = check_array_shape_classification(y_true, y_pred_set) if len(y_true.shape) != 2: y_true = cast(NDArray, column_or_1d(y_true)) y_true = np.expand_dims(y_true, axis=1) y_true = np.expand_dims(y_true, axis=1) coverage = np.nanmean( np.take_along_axis(y_pred_set, y_true, axis=1), axis=0 ) return coverage[0]
[docs]def regression_ssc( y_true: NDArray, y_intervals: NDArray, num_bins: int = 3 ) -> NDArray: """ Compute Size-Stratified Coverage metrics proposed in [3] that is the conditional coverage conditioned by the size of the intervals. The intervals are ranked by their size (ascending) and then divided into num_bins groups: one value of coverage by groups is computed. Warning: This metric should be used only with non constant intervals (intervals of different sizes), with constant intervals the result may be misinterpreted. [3] Angelopoulos, A. N., & Bates, S. (2021). A gentle introduction to conformal prediction and distribution-free uncertainty quantification. arXiv preprint arXiv:2107.07511. Parameters ---------- y_true: NDArray of shape (n_samples,) True labels. y_intervals: NDArray of shape (n_samples, 2, n_alpha) or (n_samples, 2) Prediction intervals given by booleans of labels. num_bins: int n Number of groups. Should be less than the number of different interval widths. Returns ------- NDArray of shape (n_alpha, num_bins) Examples -------- >>> from mapie.metrics import regression_ssc >>> import numpy as np >>> y_true = np.array([5, 7.5, 9.5]) >>> y_intervals = np.array([ ... [4, 6], ... [6.0, 9.0], ... [9, 10.0] ... ]) >>> print(regression_ssc(y_true, y_intervals, num_bins=2)) [[1. 1.]] """ y_true = cast(NDArray, column_or_1d(y_true)) y_intervals = check_array_shape_regression(y_true, y_intervals) check_number_bins(num_bins) widths = np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :]) check_nb_intervals_sizes(widths, num_bins) indexes_sorted = np.argsort(widths, axis=0) indexes_bybins = np.array_split(indexes_sorted, num_bins, axis=0) coverages = np.zeros((y_intervals.shape[2], num_bins)) for i, indexes in enumerate(indexes_bybins): intervals_binned = np.stack([ np.take_along_axis(y_intervals[:, 0, :], indexes, axis=0), np.take_along_axis(y_intervals[:, 1, :], indexes, axis=0) ], axis=1) coverages[:, i] = regression_coverage_score_v2(y_true[indexes], intervals_binned) return coverages
[docs]def regression_ssc_score( y_true: NDArray, y_intervals: NDArray, num_bins: int = 3 ) -> NDArray: """ Aggregate by the minimum for each alpha the Size-Stratified Coverage [3]: returns the maximum violation of the conditional coverage (with the groups defined). Warning: This metric should be used only with non constant intervals (intervals of different sizes), with constant intervals the result may be misinterpreted. Parameters ---------- y_true: NDArray of shape (n_samples,) True labels. y_intervals: NDArray of shape (n_samples, 2, n_alpha) or (n_samples, 2) Prediction intervals given by booleans of labels. num_bins: int n Number of groups. Should be less than the number of different interval widths. Returns ------- NDArray of shape (n_alpha,) Examples -------- >>> from mapie.metrics import regression_ssc >>> import numpy as np >>> y_true = np.array([5, 7.5, 9.5]) >>> y_intervals = np.array([ ... [[4, 4], [6, 7.5]], ... [[6.0, 8], [9.0, 10]], ... [[9, 9], [10.0, 10.0]] ... ]) >>> print(regression_ssc_score(y_true, y_intervals, num_bins=2)) [1. 0.5] """ return np.min(regression_ssc(y_true, y_intervals, num_bins), axis=1)
[docs]def classification_ssc( y_true: NDArray, y_pred_set: NDArray, num_bins: Union[int, None] = None ) -> NDArray: """ Compute Size-Stratified Coverage metrics proposed in [3] that is the conditional coverage conditioned by the size of the predictions sets. The sets are ranked by their size (ascending) and then divided into num_bins groups: one value of coverage by groups is computed. [3] Angelopoulos, A. N., & Bates, S. (2021). A gentle introduction to conformal prediction and distribution-free uncertainty quantification. arXiv preprint arXiv:2107.07511. Parameters ---------- y_true: NDArray of shape (n_samples,) True labels. y_pred_set: NDArray of shape (n_samples, n_class, n_alpha) or (n_samples, n_class) Prediction sets given by booleans of labels. num_bins: int or None Number of groups. If None, one value of coverage by possible size of sets (n_classes +1) is computed. Should be less than the number of different set sizes. Returns ------- NDArray of shape (n_alpha, num_bins) Examples -------- >>> from mapie.metrics import classification_ssc >>> import numpy as np >>> y_true = y_true_class = np.array([3, 3, 1, 2, 2]) >>> y_pred_set = np.array([ ... [True, True, True, True], ... [False, True, False, True], ... [True, True, True, False], ... [False, False, True, True], ... [True, True, False, True]]) >>> print(classification_ssc(y_true, y_pred_set, num_bins=2)) [[1. 0.66666667]] """ y_true = cast(NDArray, column_or_1d(y_true)) y_pred_set = check_array_shape_classification(y_true, y_pred_set) sizes = np.sum(y_pred_set, axis=1) n_classes = y_pred_set.shape[1] if num_bins is None: bins = list(range(n_classes + 1)) else: check_nb_sets_sizes(sizes, num_bins) check_number_bins(num_bins) bins = [ b[0] for b in np.array_split(range(n_classes + 1), num_bins) ] digitized_sizes = np.digitize(sizes, bins) coverages = np.zeros((y_pred_set.shape[2], len(bins))) for alpha in range(y_pred_set.shape[2]): indexes_bybins = [ np.argwhere(digitized_sizes[:, alpha] == i) for i in range(1, len(bins)+1) ] for i, indexes in enumerate(indexes_bybins): coverages[alpha, i] = classification_coverage_score_v2( y_true[indexes], np.take_along_axis( y_pred_set[:, :, alpha], indexes, axis=0 ) ) return coverages
[docs]def classification_ssc_score( y_true: NDArray, y_pred_set: NDArray, num_bins: Union[int, None] = None ) -> NDArray: """ Aggregate by the minimum for each alpha the Size-Stratified Coverage [3]: returns the maximum violation of the conditional coverage (with the groups defined). Parameters ---------- y_true: NDArray of shape (n_samples,) True labels. y_pred_set: NDArray of shape (n_samples, n_class, n_alpha) or (n_samples, n_class) Prediction sets given by booleans of labels. num_bins: int or None Number of groups. If None, one value of coverage by possible size of sets (n_classes +1) is computed. Should be less than the number of different set sizes. Returns ------- NDArray of shape (n_alpha,) Examples -------- >>> from mapie.metrics import classification_ssc_score >>> import numpy as np >>> y_true = y_true_class = np.array([3, 3, 1, 2, 2]) >>> y_pred_set = np.array([ ... [True, True, True, True], ... [False, True, False, True], ... [True, True, True, False], ... [False, False, True, True], ... [True, True, False, True]]) >>> print(classification_ssc_score(y_true, y_pred_set, num_bins=2)) [0.66666667] """ return np.nanmin(classification_ssc(y_true, y_pred_set, num_bins), axis=1)
def _gaussian_kernel( x: NDArray, kernel_size: int ) -> NDArray: """ Computes the gaussian kernel of x. (Used in hsic function) Parameters ---------- x: NDArray The values from which to compute the gaussian kernel. kernel_size: int The variance (sigma), this coefficient controls the width of the curve. """ norm_x = x ** 2 dist = -2 * np.matmul(x, x.transpose((0, 2, 1))) \ + norm_x + norm_x.transpose((0, 2, 1)) return np.exp(-dist / kernel_size)
[docs]def hsic( y_true: NDArray, y_intervals: NDArray, kernel_sizes: ArrayLike = (1, 1) ) -> NDArray: """ Compute the square root of the hsic coefficient. HSIC is Hilbert-Schmidt independence criterion that is a correlation measure. Here we use it as proposed in [4], to compute the correlation between the indicator of coverage and the interval size. If hsic is 0, the two variables (the indicator of coverage and the interval size) are independant. Warning: This metric should be used only with non constant intervals (intervals of different sizes), with constant intervals the result may be misinterpreted. [4] Feldman, S., Bates, S., & Romano, Y. (2021). Improving conditional coverage via orthogonal quantile regression. Advances in Neural Information Processing Systems, 34, 2060-2071. Parameters ---------- y_true: NDArray of shape (n_samples,) True labels. y_intervals: NDArray of shape (n_samples, 2, n_alpha) or (n_samples, 2) Prediction sets given by booleans of labels. kernel_sizes: ArrayLike of size (2,) The variance (sigma) for each variable (the indicator of coverage and the interval size), this coefficient controls the width of the curve. Returns ------- NDArray of shape (n_alpha,) One hsic correlation coefficient by alpha. Raises ------ ValueError If kernel_sizes has a length different from 2 and if it has negative or null values. Examples ------- >>> from mapie.metrics import hsic >>> import numpy as np >>> y_true = np.array([9.5, 10.5, 12.5]) >>> y_intervals = np.array([ ... [[9, 9], [10.0, 10.0]], ... [[8.5, 9], [12.5, 12]], ... [[10.5, 10.5], [12.0, 12]] ... ]) >>> print(hsic(y_true, y_intervals)) [0.31787614 0.2962914 ] """ y_true = cast(NDArray, column_or_1d(y_true)) y_intervals = check_array_shape_regression(y_true, y_intervals) kernel_sizes = cast(NDArray, column_or_1d(kernel_sizes)) if len(kernel_sizes) != 2: raise ValueError( "kernel_sizes should be an ArrayLike of length 2" ) if (kernel_sizes <= 0).any(): raise ValueError( "kernel_size should be positive" ) n_samples, _, n_alpha = y_intervals.shape y_true_per_alpha = np.tile(y_true, (n_alpha, 1)).transpose() widths = np.expand_dims( np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :]).transpose(), axis=2 ) cov_ind = np.expand_dims( np.int_( ((y_intervals[:, 0, :] <= y_true_per_alpha) & (y_intervals[:, 1, :] >= y_true_per_alpha)) ).transpose(), axis=2 ) k_mat = _gaussian_kernel(widths, kernel_sizes[0]) l_mat = _gaussian_kernel(cov_ind, kernel_sizes[1]) h_mat = np.eye(n_samples) - 1 / n_samples * np.ones((n_samples, n_samples)) hsic_mat = np.matmul(l_mat, np.matmul(h_mat, np.matmul(k_mat, h_mat))) hsic_mat /= ((n_samples - 1) ** 2) coef_hsic = np.sqrt(np.matrix.trace(hsic_mat, axis1=1, axis2=2)) return coef_hsic