Source code for mapie.metrics.regression

from typing import cast

import numpy as np
from numpy.typing import ArrayLike, NDArray
from sklearn.utils import column_or_1d

from mapie.utils import (
    _check_arrays_length,
    _check_array_nan,
    _check_array_inf,
    _check_array_shape_regression,
    _check_number_bins,
    _check_nb_intervals_sizes,
    _check_alpha,
)


[docs]def regression_mean_width_score(
    y_intervals: NDArray
) -> NDArray:
    """
    Effective mean width score obtained by the prediction intervals.

    Parameters
    ----------
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level)
        Lower and upper bound of prediction intervals
        with different confidence levels, given by the ``predict_interval`` method

    Returns
    ---------
    NDArray of shape (n_confidence_level,)
        Effective mean width of the prediction intervals for each confidence level.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.regression import regression_mean_width_score
    >>> y_intervals = np.array([[[4, 6, 8], [6, 9, 11]],
    ...                    [[9, 10, 11], [10, 12, 14]],
    ...                    [[8.5, 9.5, 10], [12.5, 12, 13]],
    ...                    [[7, 8, 9], [8.5, 9.5, 10]],
    ...                    [[5, 6, 7], [6.5, 8, 9]]])
    >>> print(regression_mean_width_score(y_intervals))
    [2.  2.2 2.4]
    """
    y_intervals = np.asarray(y_intervals, dtype=float)

    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    width = np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :])
    mean_width = width.mean(axis=0)
    return mean_width


[docs]def regression_coverage_score(
    y_true: NDArray,
    y_intervals: NDArray,
) -> NDArray:
    """
    Effective coverage obtained by the prediction intervals.

    Intervals given by the ``predict_interval`` method can be passed directly
    to the ``y_intervals`` argument (see example below).

    Beside this intended use, this function also works with:

    - ``y_true`` of shape (n_sample,) and ``y_intervals`` of shape (n_sample, 2)
    - ``y_true`` of shape (n_sample, n) and `y_intervals` of shape
      (n_sample, 2, n)

    The effective coverage is obtained by computing the fraction
    of true labels that lie within the prediction intervals.

    Parameters
    ------------
    y_true: NDArray of shape (n_samples,)
        True labels.

    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level)
        Lower and upper bound of prediction intervals
        with different confidence levels, given by the ``predict_interval`` method

    Returns
    ---------
    NDArray of shape (n_confidence_level,)
        Effective coverage obtained by the prediction intervals
        for each confidence level.

    Examples
    ---------
    >>> from mapie.metrics.regression import regression_coverage_score
    >>> from mapie.regression import SplitConformalRegressor
    >>> from mapie.utils import train_conformalize_test_split
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.linear_model import Ridge

    >>> X, y = make_regression(n_samples=500, n_features=2, noise=1.0)
    >>> (
    ...     X_train, X_conformalize, X_test,
    ...     y_train, y_conformalize, y_test
    ... ) = train_conformalize_test_split(
    ...     X, y, train_size=0.6, conformalize_size=0.2, test_size=0.2, random_state=1
    ... )

    >>> mapie_regressor = SplitConformalRegressor(
    ...     estimator=Ridge(),
    ...     confidence_level=0.95,
    ...     prefit=False,
    ... ).fit(X_train, y_train).conformalize(X_conformalize, y_conformalize)

    >>> predicted_points, predicted_intervals = mapie_regressor.predict_interval(X_test)
    >>> coverage = regression_coverage_score(y_test, predicted_intervals)[0]
    """
    _check_arrays_length(y_true, y_intervals)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    y_intervals = _check_array_shape_regression(y_true, y_intervals)
    if len(y_true.shape) != 2:
        y_true = cast(NDArray, column_or_1d(y_true))
        y_true = np.expand_dims(y_true, axis=1)
    coverages = np.mean(
        np.logical_and(
            np.less_equal(y_intervals[:, 0, :], y_true),
            np.greater_equal(y_intervals[:, 1, :], y_true)
        ),
        axis=0
    )
    return coverages


[docs]def regression_ssc(
    y_true: NDArray,
    y_intervals: NDArray,
    num_bins: int = 3
) -> NDArray:
    """
    Compute Size-Stratified Coverage metrics proposed in [3] that is
    the conditional coverage conditioned by the size of the intervals.
    The intervals are ranked by their size (ascending) and then divided into
    num_bins groups: one value of coverage by groups is computed.

    Warning: This metric should be used only with non constant intervals
    (intervals of different sizes), with constant intervals the result
    may be misinterpreted.

    [3] Angelopoulos, A. N., & Bates, S. (2021).
    A gentle introduction to conformal prediction and
    distribution-free uncertainty quantification.
    arXiv preprint arXiv:2107.07511.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level) or (n_samples, 2)
        Prediction intervals given by booleans of labels.
    num_bins: int n
        Number of groups. Should be less than the number of different
        interval widths.

    Returns
    -------
    NDArray of shape (n_confidence_level, num_bins)

    Examples
    --------
    >>> from mapie.metrics.regression import regression_ssc
    >>> import numpy as np
    >>> y_true = np.array([5, 7.5, 9.5])
    >>> y_intervals = np.array([
    ... [4, 6],
    ... [6.0, 9.0],
    ... [9, 10.0]
    ... ])
    >>> print(regression_ssc(y_true, y_intervals, num_bins=2))
    [[1. 1.]]
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    y_intervals = _check_array_shape_regression(y_true, y_intervals)
    _check_number_bins(num_bins)
    widths = np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :])
    _check_nb_intervals_sizes(widths, num_bins)

    _check_arrays_length(y_true, y_intervals)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    indexes_sorted = np.argsort(widths, axis=0)
    indexes_bybins = np.array_split(indexes_sorted, num_bins, axis=0)
    coverages = np.zeros((y_intervals.shape[2], num_bins))
    for i, indexes in enumerate(indexes_bybins):
        intervals_binned = np.stack([
            np.take_along_axis(y_intervals[:, 0, :], indexes, axis=0),
            np.take_along_axis(y_intervals[:, 1, :], indexes, axis=0)
        ], axis=1)
        coverages[:, i] = regression_coverage_score(y_true[indexes], intervals_binned)

    return coverages


[docs]def regression_ssc_score(
    y_true: NDArray,
    y_intervals: NDArray,
    num_bins: int = 3
) -> NDArray:
    """
    Aggregate by the minimum for each confidence level the Size-Stratified Coverage [3]:
    returns the maximum violation of the conditional coverage
    (with the groups defined).

    Warning: This metric should be used only with non constant intervals
    (intervals of different sizes), with constant intervals the result
    may be misinterpreted.

    [3] Angelopoulos, A. N., & Bates, S. (2021).
    A gentle introduction to conformal prediction and
    distribution-free uncertainty quantification.
    arXiv preprint arXiv:2107.07511.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level) or (n_samples, 2)
        Prediction intervals given by booleans of labels.
    num_bins: int n
        Number of groups. Should be less than the number of different
        interval widths.

    Returns
    -------
    NDArray of shape (n_confidence_level,)

    Examples
    --------
    >>> from mapie.metrics.regression import regression_ssc_score
    >>> import numpy as np
    >>> y_true = np.array([5, 7.5, 9.5])
    >>> y_intervals = np.array([
    ... [[4, 4], [6, 7.5]],
    ... [[6.0, 8], [9.0, 10]],
    ... [[9, 9], [10.0, 10.0]]
    ... ])
    >>> print(regression_ssc_score(y_true, y_intervals, num_bins=2))
    [1.  0.5]
    """
    return np.min(regression_ssc(y_true, y_intervals, num_bins), axis=1)


def _gaussian_kernel(
    x: NDArray,
    kernel_size: int
) -> NDArray:
    """
    Computes the gaussian kernel of x. (Used in hsic function)

    Parameters
    ----------
    x: NDArray
        The values from which to compute the gaussian kernel.
    kernel_size: int
        The variance (sigma), this coefficient controls the width of the curve.
    """
    norm_x = x ** 2
    dist = -2 * np.matmul(x, x.transpose((0, 2, 1))) \
        + norm_x + norm_x.transpose((0, 2, 1))
    return np.exp(-dist / kernel_size)


[docs]def hsic(
    y_true: NDArray,
    y_intervals: NDArray,
    kernel_sizes: ArrayLike = (1, 1)
) -> NDArray:
    """
    Compute the square root of the hsic coefficient. HSIC is Hilbert-Schmidt
    independence criterion that is a correlation measure. Here we use it as
    proposed in [4], to compute the correlation between the indicator of
    coverage and the interval size.

    If hsic is 0, the two variables (the indicator of coverage and the
    interval size) are independant.

    Warning: This metric should be used only with non constant intervals
    (intervals of different sizes), with constant intervals the result
    may be misinterpreted.

    [4] Feldman, S., Bates, S., & Romano, Y. (2021).
    Improving conditional coverage via orthogonal quantile regression.
    Advances in Neural Information Processing Systems, 34, 2060-2071.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level) or (n_samples, 2)
        Prediction sets given by booleans of labels.
    kernel_sizes: ArrayLike of size (2,)
        The variance (sigma) for each variable (the indicator of coverage and
        the interval size), this coefficient controls the width of the curve.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        One hsic correlation coefficient by confidence level.

    Raises
    ------
    ValueError
        If kernel_sizes has a length different from 2
        and if it has negative or null values.

    Examples
    --------
    >>> from mapie.metrics.regression import hsic
    >>> import numpy as np
    >>> y_true = np.array([9.5, 10.5, 12.5])
    >>> y_intervals = np.array([
    ... [[9, 9], [10.0, 10.0]],
    ... [[8.5, 9], [12.5, 12]],
    ... [[10.5, 10.5], [12.0, 12]]
    ... ])
    >>> print(hsic(y_true, y_intervals))
    [0.31787614 0.2962914 ]
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    y_intervals = _check_array_shape_regression(y_true, y_intervals)

    _check_arrays_length(y_true, y_intervals)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    kernel_sizes = cast(NDArray, column_or_1d(kernel_sizes))
    if len(kernel_sizes) != 2:
        raise ValueError(
            "kernel_sizes should be an ArrayLike of length 2"
        )
    if (kernel_sizes <= 0).any():
        raise ValueError(
            "kernel_size should be positive"
        )
    n_samples, _, n_confidence_level = y_intervals.shape
    y_true_per_alpha = np.tile(y_true, (n_confidence_level, 1)).transpose()
    widths = np.expand_dims(
        np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :]).transpose(),
        axis=2
    )
    cov_ind = np.expand_dims(
        np.int_(
            ((y_intervals[:, 0, :] <= y_true_per_alpha) &
             (y_intervals[:, 1, :] >= y_true_per_alpha))
        ).transpose(),
        axis=2
    )

    k_mat = _gaussian_kernel(widths, kernel_sizes[0])
    l_mat = _gaussian_kernel(cov_ind, kernel_sizes[1])
    h_mat = np.eye(n_samples) - 1 / n_samples * np.ones((n_samples, n_samples))
    hsic_mat = np.matmul(l_mat, np.matmul(h_mat, np.matmul(k_mat, h_mat)))
    hsic_mat /= ((n_samples - 1) ** 2)
    coef_hsic = np.sqrt(np.matrix.trace(hsic_mat, axis1=1, axis2=2))

    return coef_hsic


[docs]def coverage_width_based(
    y_true: ArrayLike,
    y_pred_low: ArrayLike,
    y_pred_up: ArrayLike,
    eta: float,
    confidence_level: float
) -> float:
    """
    Coverage Width-based Criterion (CWC) obtained by the prediction intervals.

    The effective coverage score is a criterion used to evaluate the quality
    of prediction intervals (PIs) based on their coverage and width.

    Khosravi, Abbas, Saeid Nahavandi, and Doug Creighton.
    "Construction of optimal prediction intervals for load forecasting
    problems."
    IEEE Transactions on Power Systems 25.3 (2010): 1496-1503.

    Parameters
    ----------
    Coverage score : float
        Prediction interval coverage probability (Coverage score), which is
        the estimated fraction of true labels that lie within the prediction
        intervals.
    Mean Width Score : float
        Prediction interval normalized average width (Mean Width Score),
        calculated as the average width of the prediction intervals.
    eta : int
        A user-defined parameter that balances the contributions of
        Mean Width Score and Coverage score in the CWC calculation.
    confidence_level : float
        A user-defined parameter representing the designed confidence level of
        the PI.

    Returns
    -------
    float
        Effective coverage score (CWC) obtained by the prediction intervals.

    Notes
    -----
    The effective coverage score (CWC) is calculated using the following
    formula:
    CWC = (1 - Mean Width Score) * exp(-eta * (Coverage score - (1-alpha))**2)

    The CWC penalizes under- and overcoverage in the same way and summarizes
    the quality of the prediction intervals in a single value.

    High Eta (Large Positive Value):

    When eta is a high positive value, it will strongly
    emphasize the contribution of (1-Mean Width Score). This means that the
    algorithm will prioritize reducing the average width of the prediction
    intervals (Mean Width Score) over achieving a high coverage probability
    (Coverage score). The exponential term np.exp(-eta*(Coverage score -
    (1-alpha))**2) will have a sharp decline as Coverage score deviates
    from (1-alpha). So, achieving a high Coverage score becomes less important
    compared to minimizing Mean Width Score.
    The impact will be narrower prediction intervals on average, which may
    result in more precise but less conservative predictions.

    Low Eta (Small Positive Value):

    When eta is a low positive value, it will still
    prioritize reducing the average width of the prediction intervals
    (Mean Width Score) but with less emphasis compared to higher
    eta values.
    The exponential term will be less steep, meaning that deviations of
    Coverage score from (1-alpha) will have a moderate impact.
    You'll get a balance between prediction precision and coverage, but the
    exact balance will depend on the specific value of eta.

    Negative Eta (Any Negative Value):

    When eta is negative, it will have a different effect on the formula.
    Negative values of eta will cause the exponential term
    np.exp(-eta*(Coverage score - (1-alpha))**2) to become larger as
    Coverage score deviates from (1-alpha). This means that
    a negative eta prioritizes achieving a high coverage probability
    (Coverage score) over minimizing Mean Width Score.
    In this case, the algorithm will aim to produce wider prediction intervals
    to ensure a higher likelihood of capturing the true values within those
    intervals, even if it sacrifices precision.
    Negative eta values might be used in scenarios where avoiding errors or
    outliers is critical.

    Null Eta (Eta = 0):

    Specifically, when eta is zero, the CWC score becomes equal to
    (1 - Mean Width Score), which is equivalent to
    (1 - average width of the prediction intervals).
    Therefore, in this case, the CWC score is primarily based on the size of
    the prediction interval.

    Examples
    --------
    >>> from mapie.metrics.regression import coverage_width_based
    >>> import numpy as np
    >>> y_true = np.array([5, 7.5, 9.5, 10.5, 12.5])
    >>> y_preds_low = np.array([4, 6, 9, 8.5, 10.5])
    >>> y_preds_up = np.array([6, 9, 10, 12.5, 12])
    >>> eta = 0.01
    >>> confidence_level = 0.9
    >>> cwb = coverage_width_based(
    ... y_true, y_preds_low, y_preds_up, eta, confidence_level
    ... )
    >>> print(np.round(cwb ,2))
    0.69
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    y_pred_low = cast(NDArray, column_or_1d(y_pred_low))
    y_pred_up = cast(NDArray, column_or_1d(y_pred_up))

    _check_alpha(confidence_level)

    coverage_score = regression_coverage_score(
        y_true,
        np.column_stack((y_pred_low, y_pred_up)),
    )[0]
    mean_width = regression_mean_width_score(
        np.column_stack((y_pred_low, y_pred_up))[:, :, np.newaxis]
    )[0]
    ref_length = np.subtract(
        float(y_true.max()),
        float(y_true.min())
    )
    avg_length = mean_width / ref_length

    cwc = (1-avg_length)*np.exp(-eta * (coverage_score - confidence_level) ** 2)

    return float(cwc)


[docs]def regression_mwi_score(
        y_true: NDArray,
        y_pis: NDArray,
        confidence_level: float
) -> float:
    """
    The Winkler score, proposed by Winkler (1972), is a measure used to
    evaluate prediction intervals, combining the length of the interval
    with a penalty that increases proportionally to the distance of an
    observation outside the interval.

    Parameters
    ----------
    y_true: ArrayLike of shape (n_samples,)
        Ground truth values
    y_pis: ArrayLike of shape (n_samples, 2, 1)
        Lower and upper bounds of prediction intervals
        output from a MAPIE regressor
    confidence_level: float
        The value of confidence_level

    Returns
    -------
    float
        The mean Winkler interval score

    References
    ----------
    [1] Robert L. Winkler
    "A Decision-Theoretic Approach to Interval Estimation",
    Journal of the American Statistical Association,
    volume 67, pages 187-191 (1972)
    (https://doi.org/10.1080/01621459.1972.10481224)
    [2] Tilmann Gneiting and Adrian E Raftery
    "Strictly Proper Scoring Rules, Prediction, and Estimation",
    Journal of the American Statistical Association,
    volume 102, pages 359-378 (2007)
    (https://doi.org/10.1198/016214506000001437) (Section 6.2)
    """

    # Undo any possible quantile crossing
    y_pred_low = np.minimum(y_pis[:, 0, 0], y_pis[:, 1, 0])
    y_pred_up = np.maximum(y_pis[:, 0, 0], y_pis[:, 1, 0])

    _check_arrays_length(y_true, y_pred_low, y_pred_up)

    # Checking for NaN and inf values
    for array in (y_true, y_pred_low, y_pred_up):
        _check_array_nan(array)
        _check_array_inf(array)

    width = np.sum(y_pred_up) - np.sum(y_pred_low)  # type: ignore
    error_above: float = np.sum((y_true - y_pred_up)[y_true > y_pred_up])
    error_below: float = np.sum((y_pred_low - y_true)[y_true < y_pred_low])
    total_error = error_above + error_below
    mwi = (width + total_error * 2 / (1 - confidence_level)) / len(y_true)
    return mwi