Source code for mapie.conformity_scores.regression

import logging
from abc import ABCMeta, abstractmethod
from typing import Tuple, cast

import numpy as np
from numpy.typing import NDArray

from mapie._machine_precision import EPSILON
from mapie.conformity_scores.interface import BaseConformityScore
from mapie.estimator.regressor import EnsembleRegressor


[docs] class BaseRegressionScore(BaseConformityScore, metaclass=ABCMeta): """ Base conformity score class for regression task. This class should not be used directly. Use derived classes instead. Parameters ---------- sym: bool Whether to consider the conformity score as symmetrical or not. consistency_check: bool, optional Whether to check the consistency between the methods ``get_estimation_distribution`` and ``get_conformity_scores``. If ``True``, the following equality must be verified:: y == self.get_estimation_distribution( y_pred, self.get_conformity_scores(y, y_pred, **kwargs), **kwargs) By default ``True``. eps: float, optional Threshold to consider when checking the consistency between ``get_estimation_distribution`` and ``get_conformity_scores``. It should be specified if ``consistency_check==True``. By default, it is defined by the default precision. """
[docs] def __init__( self, sym: bool, consistency_check: bool = True, eps: float = float(EPSILON), ): super().__init__() self.sym = sym self.consistency_check = consistency_check self.eps = eps
[docs] @abstractmethod def get_signed_conformity_scores( self, y: NDArray, y_pred: NDArray, **kwargs ) -> NDArray: """ Placeholder for ``get_conformity_scores``. Subclasses should implement this method! Compute the sample conformity scores given the predicted and observed targets. Parameters ---------- y: NDArray of shape (n_samples,) Observed target values. y_pred: NDArray of shape (n_samples,) Predicted target values. Returns ------- NDArray of shape (n_samples,) Signed conformity scores. """
[docs] def get_conformity_scores(self, y: NDArray, y_pred: NDArray, **kwargs) -> NDArray: """ Get the conformity score considering the symmetrical property if so. Parameters ---------- y: NDArray of shape (n_samples,) Observed target values. y_pred: NDArray of shape (n_samples,) Predicted target values. Returns ------- NDArray of shape (n_samples,) Conformity scores. """ conformity_scores = self.get_signed_conformity_scores(y, y_pred, **kwargs) if self.consistency_check: self.check_consistency(y, y_pred, conformity_scores, **kwargs) if self.sym: conformity_scores = np.abs(conformity_scores) return conformity_scores
[docs] def check_consistency( self, y: NDArray, y_pred: NDArray, conformity_scores: NDArray, **kwargs ) -> None: """ Check consistency between the following methods: ``get_estimation_distribution`` and ``get_signed_conformity_scores`` The following equality should be verified:: y == self.get_estimation_distribution( y_pred, self.get_conformity_scores(y, y_pred, **kwargs), **kwargs) Parameters ---------- y: NDArray of shape (n_samples,) Observed target values. y_pred: NDArray of shape (n_samples,) Predicted target values. conformity_scores: NDArray of shape (n_samples,) Conformity scores. Raises ------ ValueError If the two methods are not consistent. """ score_distribution = self.get_estimation_distribution( y_pred, conformity_scores, **kwargs ) abs_conformity_scores = np.abs(np.subtract(score_distribution, y)) max_conf_score: float = np.max(abs_conformity_scores) if max_conf_score > self.eps: raise ValueError( "The two functions get_conformity_scores and " "get_estimation_distribution of the BaseRegressionScore class " "are not consistent. " "The following equation must be verified: " "self.get_estimation_distribution(y_pred, " "self.get_conformity_scores(y, y_pred)) == y. " f"The maximum conformity score is {max_conf_score}. " "The eps attribute may need to be increased if you are " "sure that the two methods are consistent." )
[docs] @abstractmethod def get_estimation_distribution( self, y_pred: NDArray, conformity_scores: NDArray, **kwargs ) -> NDArray: """ Placeholder for ``get_estimation_distribution``. Subclasses should implement this method! Compute samples of the estimation distribution given the predicted targets and the conformity scores. Parameters ---------- y_pred: NDArray of shape (n_samples,) Predicted target values. conformity_scores: NDArray of shape (n_samples,) Conformity scores. Returns ------- NDArray of shape (n_samples,) Observed values. """
@staticmethod def _beta_optimize( alpha_np: NDArray, upper_bounds: NDArray, lower_bounds: NDArray, ) -> NDArray: """ Minimize the width of the PIs, for a given difference of quantiles. Parameters ---------- alpha_np: NDArray The quantiles to compute. upper_bounds: NDArray of shape (n_samples,) The array of upper values. lower_bounds: NDArray of shape (n_samples,) The array of lower values. Returns ------- NDArray of shape (n_samples,) Array of betas minimizing the differences ``(1-alpha+beta)-quantile - beta-quantile``. """ # Using logging.warning instead of warnings.warn to avoid warnings during tests logging.warning( "The option to optimize beta (minimize interval width) is not working and " "needs to be fixed. See more details in " "https://github.com/scikit-learn-contrib/MAPIE/issues/588" ) beta_np = cast( NDArray[np.float64], np.full( shape=(len(lower_bounds), len(alpha_np)), fill_value=np.nan, dtype=float, ), ) for ind_alpha, _alpha in enumerate(alpha_np): betas = np.linspace( _alpha / (len(lower_bounds) + 1), _alpha, num=len(lower_bounds), endpoint=True, ) one_alpha_beta = np.nanquantile( upper_bounds.astype(float), 1 - _alpha + betas, axis=1, method="higher", ) beta = np.nanquantile( lower_bounds.astype(float), betas, axis=1, method="lower", ) beta_np[:, ind_alpha] = betas[np.argmin(one_alpha_beta - beta, axis=0)] return beta_np
[docs] def get_bounds( self, X: NDArray, alpha_np: NDArray, estimator: EnsembleRegressor, conformity_scores: NDArray, ensemble: bool = False, method: str = "base", optimize_beta: bool = False, allow_infinite_bounds: bool = False, ) -> Tuple[NDArray, NDArray, NDArray]: """ Compute bounds of the prediction intervals from the observed values, the estimator of type ``EnsembleRegressor`` and the conformity scores. Parameters ---------- X: NDArray of shape (n_samples, n_features) Observed feature values. alpha_np: NDArray of shape (n_alpha,) NDArray of floats between ``0`` and ``1``, represents the uncertainty of the confidence interval. estimator: EnsembleRegressor Estimator that is fitted to predict y from X. conformity_scores: NDArray of shape (n_samples,) Conformity scores. ensemble: bool Boolean determining whether the predictions are ensembled or not. By default ``False``. method: str Method to choose for prediction interval estimates. The ``"plus"`` method implies that the quantile is calculated after estimating the bounds, whereas the other methods (among the ``"naive"``, ``"base"`` or ``"minmax"`` methods, for example) do the opposite. By default ``base``. optimize_beta: bool Whether to optimize the PIs' width or not. By default ``False``. allow_infinite_bounds: bool Allow infinite prediction intervals to be produced. By default ``False``. Returns ------- Tuple[NDArray, NDArray, NDArray] - The predictions itself. (y_pred) of shape (n_samples,). - The lower bounds of the prediction intervals of shape (n_samples, n_alpha). - The upper bounds of the prediction intervals of shape (n_samples, n_alpha). Raises ------ ValueError If beta optimisation with symmetrical conformity score function. """ if self.sym and optimize_beta: raise ValueError( "Interval width minimization cannot be used with a " + "symmetrical conformity score function." ) y_pred, y_pred_low, y_pred_up = estimator.predict(X, ensemble) signed = -1 if self.sym else 1 if optimize_beta: beta_np = self._beta_optimize( alpha_np, conformity_scores.reshape(1, -1), conformity_scores.reshape(1, -1), ) else: beta_np = alpha_np / 2 if method == "plus": alpha_low = alpha_np if self.sym else beta_np alpha_up = 1 - alpha_np if self.sym else 1 - alpha_np + beta_np conformity_scores_low = self.get_estimation_distribution( y_pred_low, signed * conformity_scores, X=X ) conformity_scores_up = self.get_estimation_distribution( y_pred_up, conformity_scores, X=X ) bound_low = self.get_quantile( conformity_scores_low, alpha_low, axis=1, reversed=True, unbounded=allow_infinite_bounds, ) bound_up = self.get_quantile( conformity_scores_up, alpha_up, axis=1, unbounded=allow_infinite_bounds ) else: if self.sym: alpha_ref = 1 - alpha_np quantile_ref = self.get_quantile( conformity_scores[..., np.newaxis], alpha_ref, axis=0 ) quantile_low, quantile_up = -quantile_ref, quantile_ref else: alpha_low, alpha_up = beta_np, 1 - alpha_np + beta_np quantile_low = self.get_quantile( conformity_scores[..., np.newaxis], alpha_low, axis=0, reversed=True, unbounded=allow_infinite_bounds, ) quantile_up = self.get_quantile( conformity_scores[..., np.newaxis], alpha_up, axis=0, unbounded=allow_infinite_bounds, ) bound_low = self.get_estimation_distribution(y_pred_low, quantile_low, X=X) bound_up = self.get_estimation_distribution(y_pred_up, quantile_up, X=X) return y_pred, bound_low, bound_up
[docs] def predict_set(self, X: NDArray, alpha_np: NDArray, **kwargs): """ Compute the prediction sets on new samples based on the uncertainty of the target confidence set. Parameters ----------- X: NDArray of shape (n_samples,) The input data or samples for prediction. alpha_np: NDArray of shape (n_alpha, ) Represents the uncertainty of the confidence set to produce. **kwargs: dict Additional keyword arguments. Returns -------- The output structure depend on the ``get_bounds`` method. The prediction sets for each sample and each alpha level. """ return self.get_bounds(X=X, alpha_np=alpha_np, **kwargs)
[docs] def get_effective_calibration_samples(self, scores: NDArray): """ Calculates the effective number of calibration samples. Parameters ---------- scores: NDArray An array of scores. Returns ------- n: int The effective number of calibration samples. """ n: int = np.sum(~np.isnan(scores)) if not self.sym: n //= 2 return n