Source code for mapie.conformity_scores.regression

import logging
from abc import ABCMeta, abstractmethod
from typing import Tuple, cast

import numpy as np
from numpy.typing import NDArray

from mapie._machine_precision import EPSILON
from mapie.conformity_scores.interface import BaseConformityScore
from mapie.estimator.regressor import EnsembleRegressor



[docs]
class BaseRegressionScore(BaseConformityScore, metaclass=ABCMeta):
    """
    Base conformity score class for regression task.

    This class should not be used directly. Use derived classes instead.
    Derived classes must implement `get_signed_conformity_scores` and
    `get_estimation_distribution`, which are declared abstract here.

    Parameters
    ----------
    sym: bool
        Whether to consider the conformity score as symmetrical or not.
        When `True`, `get_conformity_scores` returns the absolute value of
        the signed conformity scores.

    consistency_check: bool, optional
        Whether to check the consistency between the methods
        `get_estimation_distribution` and `get_conformity_scores`.
        If `True`, the following equality must be verified::

            y == self.get_estimation_distribution(
                y_pred,
                self.get_conformity_scores(y, y_pred, **kwargs),
                **kwargs)

        The check is performed by `check_consistency`, which raises a
        `ValueError` when the equality does not hold up to `eps`.

        By default `True`.

    eps: float, optional
        Threshold to consider when checking the consistency
        between `get_estimation_distribution` and `get_conformity_scores`.
        It should be specified if `consistency_check==True`.

        By default, it is defined by the default precision.
    """


[docs]
    def __init__(
        self,
        sym: bool,
        consistency_check: bool = True,
        eps: float = float(EPSILON),
    ):
        """
        Store the configuration of the conformity score.

        Parameters
        ----------
        sym: bool
            Whether the conformity score is treated as symmetrical.

        consistency_check: bool, optional
            Whether `get_conformity_scores` should call
            `check_consistency` on the scores it computes.

            By default `True`.

        eps: float, optional
            Tolerance used by `check_consistency`.

            By default, the value of `EPSILON` converted to `float`.
        """
        # The parent initialiser runs first, before any attribute is set.
        super().__init__()
        self.sym, self.consistency_check, self.eps = sym, consistency_check, eps



[docs]
    @abstractmethod
    def get_signed_conformity_scores(
        self, y: NDArray, y_pred: NDArray, **kwargs
    ) -> NDArray:
        """
        Placeholder for `get_signed_conformity_scores`.
        Subclasses should implement this method!

        Compute the sample conformity scores given the predicted and
        observed targets. The sign of the scores must be kept here:
        `get_conformity_scores` is the method that takes the absolute value
        when the score is symmetrical.

        Parameters
        ----------
        y: NDArray of shape (n_samples,)
            Observed target values.

        y_pred: NDArray of shape (n_samples,)
            Predicted target values.

        **kwargs: dict
            Additional keyword arguments, received unchanged from
            `get_conformity_scores`.

        Returns
        -------
        NDArray of shape (n_samples,)
            Signed conformity scores.
        """



[docs]
    def get_conformity_scores(self, y: NDArray, y_pred: NDArray, **kwargs) -> NDArray:
        """
        Compute the conformity scores, made non-negative when the score is
        symmetrical.

        The signed scores are obtained from `get_signed_conformity_scores`.
        If `self.consistency_check` is set, they are validated with
        `check_consistency` before any sign is dropped.

        Parameters
        ----------
        y: NDArray of shape (n_samples,)
            Observed target values.

        y_pred: NDArray of shape (n_samples,)
            Predicted target values.

        **kwargs: dict
            Additional keyword arguments, forwarded to
            `get_signed_conformity_scores` and `check_consistency`.

        Returns
        -------
        NDArray of shape (n_samples,)
            Absolute conformity scores if `self.sym`, signed ones otherwise.
        """
        signed_scores = self.get_signed_conformity_scores(y, y_pred, **kwargs)

        # The check runs on the signed scores, i.e. before np.abs is applied.
        if self.consistency_check:
            self.check_consistency(y, y_pred, signed_scores, **kwargs)

        return np.abs(signed_scores) if self.sym else signed_scores



[docs]
    def check_consistency(
        self, y: NDArray, y_pred: NDArray, conformity_scores: NDArray, **kwargs
    ) -> None:
        """
        Verify that `get_estimation_distribution` inverts
        `get_signed_conformity_scores`.

        The targets are rebuilt from the predictions and the scores, and the
        largest absolute deviation from `y` is compared with `self.eps`.
        The following equality should be verified::

            y == self.get_estimation_distribution(
                y_pred,
                self.get_conformity_scores(y, y_pred, **kwargs),
                **kwargs)

        Parameters
        ----------
        y: NDArray of shape (n_samples,)
            Observed target values.

        y_pred: NDArray of shape (n_samples,)
            Predicted target values.

        conformity_scores: NDArray of shape (n_samples,)
            Conformity scores.

        **kwargs: dict
            Additional keyword arguments, forwarded to
            `get_estimation_distribution`.

        Raises
        ------
        ValueError
            If the largest absolute deviation exceeds `self.eps`.
        """
        rebuilt_targets = self.get_estimation_distribution(
            y_pred, conformity_scores, **kwargs
        )
        max_conf_score: float = np.max(np.abs(np.subtract(rebuilt_targets, y)))

        # Kept as a negated `>` (not `<=`): a NaN deviation compares False
        # against eps and therefore does not raise.
        if not max_conf_score > self.eps:
            return

        raise ValueError(
            "The two functions get_conformity_scores and "
            "get_estimation_distribution of the BaseRegressionScore class "
            "are not consistent. "
            "The following equation must be verified: "
            "self.get_estimation_distribution(y_pred, "
            "self.get_conformity_scores(y, y_pred)) == y. "
            f"The maximum conformity score is {max_conf_score}. "
            "The eps attribute may need to be increased if you are "
            "sure that the two methods are consistent."
        )



[docs]
    @abstractmethod
    def get_estimation_distribution(
        self, y_pred: NDArray, conformity_scores: NDArray, **kwargs
    ) -> NDArray:
        """
        Placeholder for `get_estimation_distribution`.
        Subclasses should implement this method!

        Compute samples of the estimation distribution given the predicted
        targets and the conformity scores.

        This method is expected to invert `get_signed_conformity_scores`:
        `check_consistency` feeds it the signed scores computed from
        `(y, y_pred)` and requires the result to match `y` up to `self.eps`.

        Parameters
        ----------
        y_pred: NDArray of shape (n_samples,)
            Predicted target values.

        conformity_scores: NDArray of shape (n_samples,)
            Conformity scores.

        **kwargs: dict
            Additional keyword arguments. `get_bounds` calls this method
            with `X=X`; `check_consistency` forwards the keyword arguments
            it received.

        Returns
        -------
        NDArray of shape (n_samples,)
            Observed values.
        """


    @staticmethod
    def _beta_optimize(
        alpha_np: NDArray,
        upper_bounds: NDArray,
        lower_bounds: NDArray,
    ) -> NDArray:
        """
        Minimize the width of the PIs, for a given difference of quantiles.

        Parameters
        ----------
        alpha_np: NDArray
            The quantiles to compute.

        upper_bounds: NDArray of shape (n_samples,)
            The array of upper values.

        lower_bounds: NDArray of shape (n_samples,)
            The array of lower values.

        Returns
        -------
        NDArray of shape (n_samples,)
            Array of betas minimizing the differences
            `(1-alpha+beta)-quantile - beta-quantile`.
        """
        # Using logging.warning instead of warnings.warn to avoid warnings during tests
        logging.warning(
            "The option to optimize beta (minimize interval width) is not working and "
            "needs to be fixed. See more details in "
            "https://github.com/scikit-learn-contrib/MAPIE/issues/588"
        )

        beta_np = cast(
            NDArray[np.float64],
            np.full(
                shape=(len(lower_bounds), len(alpha_np)),
                fill_value=np.nan,
                dtype=float,
            ),
        )
        for ind_alpha, _alpha in enumerate(alpha_np):
            betas = np.linspace(
                _alpha / (len(lower_bounds) + 1),
                _alpha,
                num=len(lower_bounds),
                endpoint=True,
            )
            one_alpha_beta = np.nanquantile(
                upper_bounds.astype(float),
                1 - _alpha + betas,
                axis=1,
                method="higher",
            )
            beta = np.nanquantile(
                lower_bounds.astype(float),
                betas,
                axis=1,
                method="lower",
            )
            beta_np[:, ind_alpha] = betas[np.argmin(one_alpha_beta - beta, axis=0)]

        return beta_np


[docs]
    def get_bounds(
        self,
        X: NDArray,
        alpha_np: NDArray,
        estimator: EnsembleRegressor,
        conformity_scores: NDArray,
        ensemble: bool = False,
        method: str = "base",
        optimize_beta: bool = False,
        allow_infinite_bounds: bool = False,
    ) -> Tuple[NDArray, NDArray, NDArray]:
        """
        Compute bounds of the prediction intervals from the observed values,
        the estimator of type `EnsembleRegressor` and the conformity scores.

        Two strategies are implemented:

        - `method == "plus"`: the estimation distributions are built first
          from the lower/upper predictions and the conformity scores, and
          the quantiles are then taken on those distributions (axis 1).
        - any other method: the quantiles of the conformity scores are
          taken first (axis 0) and then turned into bounds through
          `get_estimation_distribution`.

        Parameters
        ----------
        X: NDArray of shape (n_samples, n_features)
            Observed feature values.

        alpha_np: NDArray of shape (n_alpha,)
            NDArray of floats between `0` and `1`, represents the
            uncertainty of the confidence interval.

        estimator: EnsembleRegressor
            Estimator that is fitted to predict y from X. Its `predict`
            method is called as `predict(X, ensemble)` and must return the
            triple `(y_pred, y_pred_low, y_pred_up)`.

        conformity_scores: NDArray of shape (n_samples,)
            Conformity scores.

        ensemble: bool
            Boolean determining whether the predictions are ensembled or not.

            By default `False`.

        method: str
            Method to choose for prediction interval estimates.
            The `"plus"` method implies that the quantile is calculated
            after estimating the bounds, whereas the other methods
            (among the `"naive"`, `"base"` or `"minmax"` methods,
            for example) do the opposite.

            By default `base`.

        optimize_beta: bool
            Whether to optimize the PIs' width or not.
            Not allowed with a symmetrical conformity score.

            By default `False`.

        allow_infinite_bounds: bool
            Allow infinite prediction intervals to be produced.
            The flag is forwarded as `unbounded` to every `get_quantile`
            call of this method, for symmetrical and asymmetrical scores
            alike.

            By default `False`.

        Returns
        -------
        Tuple[NDArray, NDArray, NDArray]
            - The predictions itself. (y_pred) of shape (n_samples,).
            - The lower bounds of the prediction intervals of shape
              (n_samples, n_alpha).
            - The upper bounds of the prediction intervals of shape
              (n_samples, n_alpha).

        Raises
        ------
        ValueError
            If beta optimisation with symmetrical conformity score function.
        """
        if self.sym and optimize_beta:
            raise ValueError(
                "Interval width minimization cannot be used with a "
                "symmetrical conformity score function."
            )

        y_pred, y_pred_low, y_pred_up = estimator.predict(X, ensemble)
        # With a symmetrical score the lower estimation distribution is built
        # from the negated scores (presumably the absolute values produced by
        # `get_conformity_scores`).
        signed = -1 if self.sym else 1

        if optimize_beta:
            beta_np = self._beta_optimize(
                alpha_np,
                conformity_scores.reshape(1, -1),
                conformity_scores.reshape(1, -1),
            )
        else:
            # Default split: half of alpha on each side of the interval.
            beta_np = alpha_np / 2

        if method == "plus":
            alpha_low = alpha_np if self.sym else beta_np
            alpha_up = 1 - alpha_np if self.sym else 1 - alpha_np + beta_np

            conformity_scores_low = self.get_estimation_distribution(
                y_pred_low, signed * conformity_scores, X=X
            )
            conformity_scores_up = self.get_estimation_distribution(
                y_pred_up, conformity_scores, X=X
            )
            bound_low = self.get_quantile(
                conformity_scores_low,
                alpha_low,
                axis=1,
                reversed=True,
                unbounded=allow_infinite_bounds,
            )
            bound_up = self.get_quantile(
                conformity_scores_up, alpha_up, axis=1, unbounded=allow_infinite_bounds
            )

        else:
            if self.sym:
                alpha_ref = 1 - alpha_np
                # Forward `allow_infinite_bounds` here too, as every other
                # `get_quantile` call in this method does; it used to be
                # dropped on this path, so the option was silently ignored.
                quantile_ref = self.get_quantile(
                    conformity_scores[..., np.newaxis],
                    alpha_ref,
                    axis=0,
                    unbounded=allow_infinite_bounds,
                )
                # One quantile serves both sides, mirrored around zero.
                quantile_low, quantile_up = -quantile_ref, quantile_ref

            else:
                alpha_low, alpha_up = beta_np, 1 - alpha_np + beta_np

                quantile_low = self.get_quantile(
                    conformity_scores[..., np.newaxis],
                    alpha_low,
                    axis=0,
                    reversed=True,
                    unbounded=allow_infinite_bounds,
                )
                quantile_up = self.get_quantile(
                    conformity_scores[..., np.newaxis],
                    alpha_up,
                    axis=0,
                    unbounded=allow_infinite_bounds,
                )

            bound_low = self.get_estimation_distribution(y_pred_low, quantile_low, X=X)
            bound_up = self.get_estimation_distribution(y_pred_up, quantile_up, X=X)

        return y_pred, bound_low, bound_up



[docs]
    def predict_set(self, X: NDArray, alpha_np: NDArray, **kwargs):
        """
        Build the prediction sets for new samples by delegating to
        `get_bounds`.

        Parameters
        ----------
        X: NDArray of shape (n_samples, n_features)
            The input data or samples for prediction.

        alpha_np: NDArray of shape (n_alpha,)
            Represents the uncertainty of the confidence set to produce.

        **kwargs: dict
            Additional keyword arguments, passed as-is to `get_bounds`.

        Returns
        -------
        result
            Whatever `get_bounds` returns for these arguments, i.e. the
            prediction sets for each sample and each alpha level.
        """
        bounds = self.get_bounds(X=X, alpha_np=alpha_np, **kwargs)
        return bounds



[docs]
    def get_effective_calibration_samples(self, scores: NDArray):
        """
        Count the calibration samples that effectively contribute.

        NaN scores are ignored. For a non-symmetrical conformity score the
        count is floor-divided by two.

        Parameters
        ----------
        scores: NDArray
            An array of scores.

        Returns
        -------
        n: int
            The effective number of calibration samples.
        """
        valid_count = np.sum(~np.isnan(scores))
        return valid_count if self.sym else valid_count // 2