Source code for mapie.regression.regression

from __future__ import annotations

from typing import Any, Iterable, Optional, Tuple, Union, cast

import numpy as np
from numpy.typing import ArrayLike, NDArray
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import BaseCrossValidator
from sklearn.pipeline import Pipeline
from sklearn.utils import check_random_state
from sklearn.utils.validation import _check_y, indexable

from mapie.conformity_scores import BaseRegressionScore, ResidualNormalisedScore
from mapie.conformity_scores.utils import (
    check_and_select_conformity_score,
    check_regression_conformity_score,
)
from mapie.estimator.regressor import EnsembleRegressor
from mapie.subsample import Subsample
from mapie.utils import (
    _cast_point_predictions_to_ndarray,
    _cast_predictions_to_ndarray_tuple,
    _check_alpha,
    _check_alpha_and_n_samples,
    _check_cv,
    _check_cv_not_string,
    _check_estimator_fit_predict,
    _check_if_param_in_allowed_values,
    _check_n_features_in,
    _check_n_jobs,
    _check_null_weight,
    _check_predict_params,
    _check_verbose,
    _fit_estimator,
    _prepare_params,
    _raise_error_if_fit_called_in_prefit_mode,
    _raise_error_if_method_already_called,
    _raise_error_if_previous_method_not_called,
    _transform_confidence_level_to_alpha_list,
    check_is_fitted,
    check_sklearn_user_model_is_fitted,
)


class SplitConformalRegressor:
    """
    Builds prediction intervals with the split conformal regression technique.

    The workflow has three steps:

    1. `fit` (optional) trains the base regressor on the training data.
    2. `conformalize` estimates the uncertainty of the base regressor by
       computing conformity scores on the conformalization set.
    3. `predict_interval` returns point predictions together with intervals.

    Parameters
    ----------
    estimator : RegressorMixin, default=LinearRegression()
        Base regressor used to predict points.

    confidence_level : Union[float, Iterable[float]], default=0.9
        Desired coverage probability of the prediction intervals. A single
        float requests one confidence level; when several values are given,
        one prediction interval per confidence level is returned.

    conformity_score : Union[str, BaseRegressionScore], default="absolute"
        How conformity scores are computed.

        Valid options:

        - "absolute"
        - "gamma"
        - "residual_normalized"
        - Any subclass of BaseRegressionScore

        A custom score function inheriting from BaseRegressionScore may also
        be provided.

        See [theoretical description (conformity scores)](../theory/conformity-scores.md).

    prefit : bool, default=True
        If True, the base regressor must already be fitted and the `fit`
        method must be skipped. If False, the base regressor is fitted when
        `fit` is called.

    n_jobs : Optional[int], default=None
        Number of jobs to run in parallel when applicable.

    verbose : int, default=0
        Verbosity level. Higher values increase the output details.

    Examples
    --------
    >>> from mapie.regression import SplitConformalRegressor
    >>> from mapie.utils import train_conformalize_test_split
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.linear_model import Ridge

    >>> X, y = make_regression(n_samples=500, n_features=2, noise=1.0)
    >>> (
    ...     X_train, X_conformalize, X_test,
    ...     y_train, y_conformalize, y_test
    ... ) = train_conformalize_test_split(
    ...     X, y, train_size=0.6, conformalize_size=0.2, test_size=0.2, random_state=1
    ... )

    >>> mapie_regressor = SplitConformalRegressor(
    ...     estimator=Ridge(),
    ...     confidence_level=0.95,
    ...     prefit=False,
    ... ).fit(X_train, y_train).conformalize(X_conformalize, y_conformalize)

    >>> predicted_points, predicted_intervals = mapie_regressor.predict_interval(X_test)
    """

    def __init__(
        self,
        estimator: RegressorMixin = LinearRegression(),
        confidence_level: Union[float, Iterable[float]] = 0.9,
        conformity_score: Union[str, BaseRegressionScore] = "absolute",
        prefit: bool = True,
        n_jobs: Optional[int] = None,
        verbose: int = 0,
    ) -> None:
        _check_estimator_fit_predict(estimator)

        # Workflow state: a prefit estimator counts as already fitted.
        self._is_conformalized = False
        self._is_fitted = prefit
        self._prefit = prefit
        self._estimator = estimator

        self._conformity_score = check_and_select_conformity_score(
            conformity_score, BaseRegressionScore
        )

        # Developer note: this v1 class is layered on the v0 backend without
        # modifying it. The backend is therefore always created with
        # cv="prefit", and the estimator is fitted by `.fit` when needed.
        self._mapie_regressor = _MapieRegressor(
            estimator=self._estimator,
            method="base",
            cv="prefit",
            n_jobs=n_jobs,
            verbose=verbose,
            conformity_score=self._conformity_score,
        )

        self._alphas = _transform_confidence_level_to_alpha_list(confidence_level)
        self._predict_params: dict = {}

    def fit(
        self,
        X_train: ArrayLike,
        y_train: ArrayLike,
        fit_params: Optional[dict] = None,
    ) -> SplitConformalRegressor:
        """
        Trains the base regressor on the training data.

        Parameters
        ----------
        X_train : ArrayLike
            Features of the training set.

        y_train : ArrayLike
            Targets of the training set.

        fit_params : Optional[dict], default=None
            Parameters forwarded to the `fit` method of the base regressor.

        Returns
        -------
        Self
            The fitted SplitConformalRegressor instance.
        """
        _raise_error_if_fit_called_in_prefit_mode(self._prefit)
        _raise_error_if_method_already_called("fit", self._is_fitted)

        # A clone is fitted, so the estimator passed by the user is left as is.
        fresh_estimator = clone(self._estimator)
        _fit_estimator(
            fresh_estimator, X_train, y_train, **_prepare_params(fit_params)
        )

        # Hand the fitted clone over to the backend created in `__init__`.
        self._mapie_regressor.estimator = fresh_estimator
        self._is_fitted = True
        return self

    def conformalize(
        self,
        X_conformalize: ArrayLike,
        y_conformalize: ArrayLike,
        predict_params: Optional[dict] = None,
    ) -> SplitConformalRegressor:
        """
        Estimates the uncertainty of the base regressor by computing
        conformity scores on the conformalization set.

        Parameters
        ----------
        X_conformalize : ArrayLike
            Features of the conformalization set.

        y_conformalize : ArrayLike
            Targets of the conformalization set.

        predict_params : Optional[dict], default=None
            Parameters forwarded to the `predict` method of the base
            regressor. They are stored and reused by the `predict_interval`
            and `predict` methods of this SplitConformalRegressor.

        Returns
        -------
        Self
            The conformalized SplitConformalRegressor instance.
        """
        _raise_error_if_previous_method_not_called(
            "conformalize", "fit", self._is_fitted
        )
        _raise_error_if_method_already_called("conformalize", self._is_conformalized)

        self._predict_params = _prepare_params(predict_params)
        # The backend runs in "prefit" mode, so its `fit` is the
        # conformalization step of this class.
        self._mapie_regressor.fit(
            X_conformalize,
            y_conformalize,
            predict_params=self._predict_params,
        )

        self._is_conformalized = True
        return self

    def predict_interval(
        self,
        X: ArrayLike,
        minimize_interval_width: bool = False,
        allow_infinite_bounds: bool = False,
    ) -> Tuple[NDArray, NDArray]:
        """
        Predicts points (using the base regressor) and intervals.

        When several confidence levels were given at initialisation, one
        interval per confidence level is predicted for each sample. See the
        return signature.

        Parameters
        ----------
        X : ArrayLike
            Features

        minimize_interval_width : bool, default=False
            If True, attempts to minimize the intervals width.

        allow_infinite_bounds : bool, default=False
            If True, allows prediction intervals with infinite bounds.

        Returns
        -------
        Tuple[NDArray, NDArray]
            Two arrays:

            - Prediction points, of shape `(n_samples,)`
            - Prediction intervals, of shape `(n_samples, 2, n_confidence_levels)`
        """
        _raise_error_if_previous_method_not_called(
            "predict_interval", "conformalize", self._is_conformalized
        )

        # `minimize_interval_width` is the v1 name of the backend's
        # `optimize_beta` argument.
        return _cast_predictions_to_ndarray_tuple(
            self._mapie_regressor.predict(
                X,
                alpha=self._alphas,
                optimize_beta=minimize_interval_width,
                allow_infinite_bounds=allow_infinite_bounds,
                **self._predict_params,
            )
        )

    def predict(
        self,
        X: ArrayLike,
    ) -> NDArray:
        """
        Predicts points.

        Parameters
        ----------
        X : ArrayLike
            Features

        Returns
        -------
        NDArray
            Array of point predictions, with shape (n_samples,).
        """
        _raise_error_if_previous_method_not_called(
            "predict", "conformalize", self._is_conformalized
        )

        # alpha=None is passed so that only point predictions are requested.
        return _cast_point_predictions_to_ndarray(
            self._mapie_regressor.predict(X, alpha=None, **self._predict_params)
        )
class CrossConformalRegressor:
    """
    Computes prediction intervals using the cross conformal regression technique:

    1. The `fit_conformalize` method estimates the uncertainty of the base regressor
       in a cross-validation style. It fits the base regressor on folds of the
       dataset and computes conformity scores on the out-of-fold data.
    2. The `predict_interval` computes prediction points and intervals.

    Parameters
    ----------
    estimator : RegressorMixin, default=LinearRegression()
        The base regressor used to predict points.

    confidence_level : Union[float, Iterable[float]], default=0.9
        The confidence level(s) for the prediction intervals, indicating the
        desired coverage probability of the prediction intervals. If a float is
        provided, it represents a single confidence level. If several values
        are provided, multiple prediction intervals for each specified
        confidence level are returned.

    conformity_score : Union[str, BaseRegressionScore], default="absolute"
        The method used to compute conformity scores

        Valid options:

        - "absolute"
        - "gamma"
        - The corresponding subclasses of BaseRegressionScore

        A custom score function inheriting from BaseRegressionScore may also
        be provided.

        See [theoretical description (conformity scores)](../theory/conformity-scores.md).

    method : str, default="plus"
        The method used to compute prediction intervals. Options are:

        - "base": Based on the conformity scores from each fold.
        - "plus": Based on the conformity scores from each fold and
          the test set predictions.
        - "minmax": Based on the conformity scores from each fold and
          the test set predictions, using the minimum and maximum among
          each fold models.

    cv : Union[int, BaseCrossValidator], default=5
        The cross-validator used to compute conformity scores.
        Valid options:

        - integer, to specify the number of folds
        - any `sklearn.model_selection.BaseCrossValidator` suitable for
          regression, or a custom cross-validator inheriting from it.
          Main variants in the cross conformal setting are:

          - `sklearn.model_selection.KFold` (vanilla cross conformal)
          - `sklearn.model_selection.LeaveOneOut` (jackknife)

    n_jobs : Optional[int], default=None
        The number of jobs to run in parallel when applicable.

    verbose : int, default=0
        Controls the verbosity level.
        Higher values increase the output details.

    random_state : Optional[Union[int, np.random.RandomState]], default=None
        A seed or random state instance to ensure reproducibility in any random
        operations within the regressor.

    Examples
    --------
    >>> from mapie.regression import CrossConformalRegressor
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.linear_model import Ridge

    >>> X_full, y_full = make_regression(n_samples=500,n_features=2,noise=1.0)
    >>> X, X_test, y, y_test = train_test_split(X_full, y_full)

    >>> mapie_regressor = CrossConformalRegressor(
    ...     estimator=Ridge(),
    ...     confidence_level=0.95,
    ...     cv=10
    ... ).fit_conformalize(X, y)

    >>> predicted_points, predicted_intervals = mapie_regressor.predict_interval(X_test)
    """

    # Values accepted for the `method` constructor argument.
    _VALID_METHODS = ["base", "plus", "minmax"]

    def __init__(
        self,
        estimator: RegressorMixin = LinearRegression(),
        confidence_level: Union[float, Iterable[float]] = 0.9,
        conformity_score: Union[str, BaseRegressionScore] = "absolute",
        method: str = "plus",
        cv: Union[int, BaseCrossValidator] = 5,
        n_jobs: Optional[int] = None,
        verbose: int = 0,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
    ) -> None:
        _check_if_param_in_allowed_values(
            method, "method", CrossConformalRegressor._VALID_METHODS
        )
        _check_cv_not_string(cv)

        self._mapie_regressor = _MapieRegressor(
            estimator=estimator,
            method=method,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
            conformity_score=check_and_select_conformity_score(
                conformity_score,
                BaseRegressionScore,
            ),
            random_state=random_state,
        )

        self._alphas = _transform_confidence_level_to_alpha_list(confidence_level)
        self.is_fitted_and_conformalized = False

        self._predict_params: dict = {}

    def fit_conformalize(
        self,
        X: ArrayLike,
        y: ArrayLike,
        groups: Optional[ArrayLike] = None,
        fit_params: Optional[dict] = None,
        predict_params: Optional[dict] = None,
    ) -> CrossConformalRegressor:
        """
        Estimates the uncertainty of the base regressor in a cross-validation
        style: fits the base regressor on different folds of the dataset
        and computes conformity scores on the corresponding out-of-fold data.

        Parameters
        ----------
        X : ArrayLike
            Features

        y : ArrayLike
            Targets

        groups : Optional[ArrayLike] of shape (n_samples,), default=None
            Groups to pass to the cross-validator.

        fit_params : Optional[dict], default=None
            Parameters to pass to the `fit` method of the base regressor.

        predict_params : Optional[dict], default=None
            Parameters to pass to the `predict` method of the base regressor.
            These parameters will also be used in the `predict_interval`
            and `predict` methods of this CrossConformalRegressor.

        Returns
        -------
        Self
            This CrossConformalRegressor instance, fitted and conformalized.
        """
        _raise_error_if_method_already_called(
            "fit_conformalize",
            self.is_fitted_and_conformalized,
        )

        fit_params_ = _prepare_params(fit_params)
        # Stored so that `predict_interval` and `predict` reuse the same
        # parameters.
        self._predict_params = _prepare_params(predict_params)
        self._mapie_regressor.fit(
            X,
            y,
            groups=groups,
            fit_params=fit_params_,
            predict_params=self._predict_params,
        )

        self.is_fitted_and_conformalized = True
        return self

    def predict_interval(
        self,
        X: ArrayLike,
        aggregate_predictions: Optional[str] = "mean",
        minimize_interval_width: bool = False,
        allow_infinite_bounds: bool = False,
    ) -> Tuple[NDArray, NDArray]:
        """
        Predicts points and intervals.

        If several confidence levels were provided during initialisation, several
        intervals will be predicted for each sample. See the return signature.

        By default, points are predicted using an aggregation.
        See the `aggregate_predictions` parameter.

        Parameters
        ----------
        X : ArrayLike
            Features

        aggregate_predictions : Optional[str], default="mean"
            The method to predict a point. Options:

            - None: a point is predicted using the regressor trained on the
              entire data
            - "mean": Averages the predictions of the regressors trained on
              each cross-validation fold
            - "median": Aggregates (using median) the predictions of the
              regressors trained on each cross-validation fold

        minimize_interval_width : bool, default=False
            If True, attempts to minimize the interval width.

        allow_infinite_bounds : bool, default=False
            If True, allows prediction intervals with infinite bounds.

        Returns
        -------
        Tuple[NDArray, NDArray]
            Two arrays:

            - Prediction points, of shape `(n_samples,)`
            - Prediction intervals, of shape `(n_samples, 2, n_confidence_levels)`
        """
        _raise_error_if_previous_method_not_called(
            "predict_interval",
            "fit_conformalize",
            self.is_fitted_and_conformalized,
        )

        ensemble = self._set_aggregate_predictions_and_return_ensemble(
            aggregate_predictions
        )
        # `minimize_interval_width` is forwarded as the backend's
        # `optimize_beta` argument.
        predictions = self._mapie_regressor.predict(
            X,
            alpha=self._alphas,
            optimize_beta=minimize_interval_width,
            allow_infinite_bounds=allow_infinite_bounds,
            ensemble=ensemble,
            **self._predict_params,
        )
        return _cast_predictions_to_ndarray_tuple(predictions)

    def predict(
        self,
        X: ArrayLike,
        aggregate_predictions: Optional[str] = "mean",
    ) -> NDArray:
        """
        Predicts points.

        By default, points are predicted using an aggregation.
        See the `aggregate_predictions` parameter.

        Parameters
        ----------
        X : ArrayLike
            Features

        aggregate_predictions : Optional[str], default="mean"
            The method to predict a point. Options:

            - None: a point is predicted using the regressor trained on the
              entire data
            - "mean": Averages the predictions of the regressors trained on
              each cross-validation fold
            - "median": Aggregates (using median) the predictions of the
              regressors trained on each cross-validation fold

        Returns
        -------
        NDArray
            Array of point predictions, with shape `(n_samples,)`.
        """
        _raise_error_if_previous_method_not_called(
            "predict",
            "fit_conformalize",
            self.is_fitted_and_conformalized,
        )

        ensemble = self._set_aggregate_predictions_and_return_ensemble(
            aggregate_predictions
        )
        predictions = self._mapie_regressor.predict(
            X,
            alpha=None,
            ensemble=ensemble,
            **self._predict_params,
        )
        return _cast_point_predictions_to_ndarray(predictions)

    def _set_aggregate_predictions_and_return_ensemble(
        self, aggregate_predictions: Optional[str]
    ) -> bool:
        """
        Translates `aggregate_predictions` into the backend's `ensemble` flag.

        A falsy value (such as None) disables aggregation and leaves the
        backend untouched. Otherwise the value is validated by the backend's
        `_check_agg_function` and stored as the backend's `agg_function`.

        Parameters
        ----------
        aggregate_predictions : Optional[str]
            Aggregation requested by the caller of `predict` or
            `predict_interval`.

        Returns
        -------
        bool
            False when no aggregation is requested, True otherwise.
        """
        if not aggregate_predictions:
            ensemble = False
        else:
            ensemble = True
            self._mapie_regressor._check_agg_function(aggregate_predictions)
            # A hack here, to allow choosing the aggregation function at
            # prediction time
            self._mapie_regressor.agg_function = aggregate_predictions
        return ensemble
class JackknifeAfterBootstrapRegressor:
    """
    Computes prediction intervals using the jackknife-after-bootstrap technique:

    1. The `fit_conformalize` method estimates the uncertainty of the base regressor
       using bootstrap sampling. It fits the base regressor on samples of the dataset
       and computes conformity scores on the out-of-sample data.
    2. The `predict_interval` computes prediction points and intervals.

    Parameters
    ----------
    estimator : RegressorMixin, default=LinearRegression()
        The base regressor used to predict points.

    confidence_level : Union[float, Iterable[float]], default=0.9
        The confidence level(s) for the prediction intervals, indicating the
        desired coverage probability of the prediction intervals. If a float is
        provided, it represents a single confidence level. If several values
        are provided, multiple prediction intervals for each specified
        confidence level are returned.

    conformity_score : Union[str, BaseRegressionScore], default="absolute"
        The method used to compute conformity scores

        Valid options:

        - "absolute"
        - "gamma"
        - The corresponding subclasses of BaseRegressionScore

        A custom score function inheriting from BaseRegressionScore may also
        be provided.

        See [theoretical description (conformity scores)](../theory/conformity-scores.md).

    method : str, default="plus"
        The method used to compute prediction intervals. Options are:

        - "plus": Based on the conformity scores from each bootstrap sample and
          the testing prediction.
        - "minmax": Based on the minimum and maximum conformity scores from
          each bootstrap sample.

        Note: The "base" method is not mentioned in the conformal inference
        literature for Jackknife after bootstrap strategies, hence not provided
        here.

    resampling : Union[int, Subsample], default=30
        Number of bootstrap resamples or an instance of `Subsample` for
        custom sampling strategy. NumPy integer scalars are accepted as well
        and converted to `int`.

    aggregation_method : str, default="mean"
        Aggregation method for predictions across bootstrap samples. Options:

        - "mean"
        - "median"

    n_jobs : Optional[int], default=None
        The number of jobs to run in parallel when applicable.

    verbose : int, default=0
        Controls the verbosity level. Higher values increase the output details.

    random_state : Optional[Union[int, np.random.RandomState]], default=None
        A seed or random state instance to ensure reproducibility in any random
        operations within the regressor.

    Examples
    --------
    >>> from mapie.regression import JackknifeAfterBootstrapRegressor
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.linear_model import Ridge

    >>> X_full, y_full = make_regression(n_samples=500,n_features=2,noise=1.0)
    >>> X, X_test, y, y_test = train_test_split(X_full, y_full)

    >>> mapie_regressor = JackknifeAfterBootstrapRegressor(
    ...     estimator=Ridge(),
    ...     confidence_level=0.95,
    ...     resampling=25,
    ... ).fit_conformalize(X, y)

    >>> predicted_points, predicted_intervals = mapie_regressor.predict_interval(X_test)
    """

    # Values accepted for the `method` constructor argument.
    _VALID_METHODS = ["plus", "minmax"]
    # Values accepted for the `aggregation_method` constructor argument.
    _VALID_AGGREGATION_METHODS = ["mean", "median"]

    def __init__(
        self,
        estimator: RegressorMixin = LinearRegression(),
        confidence_level: Union[float, Iterable[float]] = 0.9,
        conformity_score: Union[str, BaseRegressionScore] = "absolute",
        method: str = "plus",
        resampling: Union[int, Subsample] = 30,
        aggregation_method: str = "mean",
        n_jobs: Optional[int] = None,
        verbose: int = 0,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
    ) -> None:
        _check_if_param_in_allowed_values(
            method, "method", JackknifeAfterBootstrapRegressor._VALID_METHODS
        )
        _check_if_param_in_allowed_values(
            aggregation_method,
            "aggregation_method",
            JackknifeAfterBootstrapRegressor._VALID_AGGREGATION_METHODS,
        )

        # The backend takes the resampling strategy through its `cv` argument.
        cv = self._check_and_convert_resampling_to_cv(resampling)

        self._mapie_regressor = _MapieRegressor(
            estimator=estimator,
            method=method,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
            agg_function=aggregation_method,
            conformity_score=check_and_select_conformity_score(
                conformity_score,
                BaseRegressionScore,
            ),
            random_state=random_state,
        )

        self._alphas = _transform_confidence_level_to_alpha_list(confidence_level)
        self.is_fitted_and_conformalized = False

        self._predict_params: dict = {}

    def fit_conformalize(
        self,
        X: ArrayLike,
        y: ArrayLike,
        fit_params: Optional[dict] = None,
        predict_params: Optional[dict] = None,
    ) -> JackknifeAfterBootstrapRegressor:
        """
        Estimates the uncertainty of the base regressor using bootstrap sampling:
        fits the base regressor on (potentially overlapping) samples of the
        dataset, and computes conformity scores on the corresponding out of
        samples data.

        Parameters
        ----------
        X : ArrayLike
            Features

        y : ArrayLike
            Targets

        fit_params : Optional[dict], default=None
            Parameters to pass to the `fit` method of the base regressor.

        predict_params : Optional[dict], default=None
            Parameters to pass to the `predict` method of the base regressor.
            These parameters will also be used in the `predict_interval`
            and `predict` methods of this JackknifeAfterBootstrapRegressor.

        Returns
        -------
        Self
            This JackknifeAfterBootstrapRegressor instance, fitted and conformalized.
        """
        _raise_error_if_method_already_called(
            "fit_conformalize",
            self.is_fitted_and_conformalized,
        )

        fit_params_ = _prepare_params(fit_params)
        # Stored so that `predict_interval` and `predict` reuse the same
        # parameters.
        self._predict_params = _prepare_params(predict_params)
        self._mapie_regressor.fit(
            X,
            y,
            fit_params=fit_params_,
            predict_params=self._predict_params,
        )

        self.is_fitted_and_conformalized = True
        return self

    def predict_interval(
        self,
        X: ArrayLike,
        ensemble: bool = True,
        minimize_interval_width: bool = False,
        allow_infinite_bounds: bool = False,
    ) -> Tuple[NDArray, NDArray]:
        """
        Predicts points and intervals.

        If several confidence levels were provided during initialisation, several
        intervals will be predicted for each sample. See the return signature.

        By default, points are predicted using an aggregation.
        See the `ensemble` parameter.

        Parameters
        ----------
        X : ArrayLike
            Test data for prediction intervals.

        ensemble : bool, default=True
            If True, a predicted point is an aggregation of the predictions of
            the regressors trained on each bootstrap samples. This aggregation
            depends on the `aggregation_method` provided during initialisation.

            If False, a point is predicted using the regressor trained on the
            entire data

        minimize_interval_width : bool, default=False
            If True, attempts to minimize the interval width.

        allow_infinite_bounds : bool, default=False
            If True, allows prediction intervals with infinite bounds.

        Returns
        -------
        Tuple[NDArray, NDArray]
            Two arrays:

            - Prediction points, of shape `(n_samples,)`
            - Prediction intervals, of shape `(n_samples, 2, n_confidence_levels)`
        """
        _raise_error_if_previous_method_not_called(
            "predict_interval",
            "fit_conformalize",
            self.is_fitted_and_conformalized,
        )

        # `minimize_interval_width` is forwarded as the backend's
        # `optimize_beta` argument.
        predictions = self._mapie_regressor.predict(
            X,
            alpha=self._alphas,
            optimize_beta=minimize_interval_width,
            allow_infinite_bounds=allow_infinite_bounds,
            ensemble=ensemble,
            **self._predict_params,
        )
        return _cast_predictions_to_ndarray_tuple(predictions)

    def predict(
        self,
        X: ArrayLike,
        ensemble: bool = True,
    ) -> NDArray:
        """
        Predicts points.

        By default, points are predicted using an aggregation.
        See the `ensemble` parameter.

        Parameters
        ----------
        X : ArrayLike
            Data features for generating point predictions.

        ensemble : bool, default=True
            If True, a predicted point is an aggregation of the predictions of
            the regressors trained on each bootstrap samples. This aggregation
            depends on the `aggregation_method` provided during initialisation.

            If False, a point is predicted using the regressor trained on the
            entire data

        Returns
        -------
        NDArray
            Array of point predictions, with shape `(n_samples,)`.
        """
        _raise_error_if_previous_method_not_called(
            "predict",
            "fit_conformalize",
            self.is_fitted_and_conformalized,
        )

        predictions = self._mapie_regressor.predict(
            X,
            alpha=None,
            ensemble=ensemble,
            **self._predict_params,
        )
        return _cast_point_predictions_to_ndarray(predictions)

    @staticmethod
    def _check_and_convert_resampling_to_cv(
        resampling: Union[int, Subsample],
    ) -> Subsample:
        """
        Converts the `resampling` constructor argument into a `Subsample`.

        Parameters
        ----------
        resampling : Union[int, Subsample]
            Either a number of resamplings (a Python `int` or a NumPy integer
            scalar) or a ready-made `Subsample` instance.

        Returns
        -------
        Subsample
            The given instance, or a new `Subsample` built with
            `n_resamplings=resampling`.

        Raises
        ------
        ValueError
            If `resampling` is neither an integer nor a `Subsample` instance.
        """
        if isinstance(resampling, Subsample):
            return resampling

        # NumPy integer scalars (e.g. np.int64) are not `int` instances;
        # convert them so that they are accepted like plain integers.
        if isinstance(resampling, np.integer):
            resampling = int(resampling)

        if isinstance(resampling, int):
            return Subsample(n_resamplings=resampling)

        raise ValueError("resampling must be an integer or a Subsample instance")
class _MapieRegressor(RegressorMixin, BaseEstimator):
    """
    Note to users: _MapieRegressor is now private, and may change at any time.
    Please use SplitConformalRegressor, CrossConformalRegressor or
    JackknifeAfterBootstrapRegressor instead.
    See the v1 release notes for more information.

    Prediction interval with out-of-fold conformity scores.

    This class implements the jackknife+ strategy and its variations
    for estimating prediction intervals on single-output data. The
    idea is to evaluate out-of-fold conformity scores (signed residuals,
    absolute residuals, residuals normalized by the predicted mean...)
    on hold-out validation sets and to deduce valid confidence intervals
    with strong theoretical guarantees.

    Parameters
    ----------
    estimator: Optional[RegressorMixin]
        Any regressor with scikit-learn API
        (i.e. with `fit` and `predict` methods).
        If `None`, estimator defaults to a `LinearRegression` instance.

        By default `None`.

    method: str
        Method to choose for prediction interval estimates.
        Choose among:

        - `"naive"`, based on training set conformity scores,
        - `"base"`, based on validation sets conformity scores,
        - `"plus"`, based on validation conformity scores and
          testing predictions,
        - `"minmax"`, based on validation conformity scores and
          testing predictions (min/max among cross-validation clones).

        By default `"plus"`.

    cv: Optional[Union[int, str, BaseCrossValidator]]
        The cross-validation strategy for computing conformity scores.
        It directly drives the distinction between jackknife and cv variants.
        Choose among:

        - `None`, to use the default 5-fold cross-validation
        - integer, to specify the number of folds.
          If equal to `-1`, equivalent to
          `sklearn.model_selection.LeaveOneOut()`.
        - CV splitter: any `sklearn.model_selection.BaseCrossValidator`
          Main variants are:

          - `sklearn.model_selection.LeaveOneOut` (jackknife),
          - `sklearn.model_selection.KFold` (cross-validation),
          - `subsample.Subsample` object (bootstrap).

        - `"split"`, does not involve cross-validation but a division
          of the data into training and calibration subsets. The splitter
          used is the following: `sklearn.model_selection.ShuffleSplit`.
          `method` parameter is set to `"base"`.
        - `"prefit"`, assumes that `estimator` has been fitted already,
          and the `method` parameter is set to `"base"`.
          All data provided in the `fit` method is then used
          for computing conformity scores only.
          At prediction time, quantiles of these conformity scores are used
          to provide a prediction interval with fixed width.
          The user has to take care manually that data for model fitting and
          conformity scores estimate are disjoint.

        By default `None`.

    test_size: Optional[Union[int, float]]
        If `float`, should be between `0.0` and `1.0` and represent the
        proportion of the dataset to include in the test split. If `int`,
        represents the absolute number of test samples. If `None`,
        it will be set to `0.1`.

        If cv is not `"split"`, `test_size` is ignored.

        By default `None`.

    n_jobs: Optional[int]
        Number of jobs for parallel processing using joblib
        via the "loky" backend.
        If `-1` all CPUs are used.
        If `1` is given, no parallel computing code is used at all,
        which is useful for debugging.
        For `n_jobs` below `-1`, `(n_cpus + 1 + n_jobs)` are used.
        `None` is a marker for `unset` that will be interpreted as
        `n_jobs=1` (sequential execution).

        By default `None`.

    agg_function: Optional[str]
        Determines how to aggregate predictions from perturbed models, both at
        training and prediction time.

        If `None`, it is ignored except if `cv` class is `Subsample`,
        in which case an error is raised.
        If `"mean"` or `"median"`, returns the mean or median of the
        predictions computed from the out-of-folds models.
        Note: if you plan to set the `ensemble` argument to `True` in the
        `predict` method, you have to specify an aggregation function.
        Otherwise an error would be raised.

        The Jackknife+ interval can be interpreted as an interval around the
        median prediction, and is guaranteed to lie inside the interval,
        unlike the single estimator predictions.

        When the cross-validation strategy is `Subsample` (i.e. for the
        Jackknife+-after-Bootstrap method), this function is also used to
        aggregate the training set in-sample predictions.

        If `cv` is `"prefit"` or `"split"`, `agg_function` is ignored.

        By default `"mean"`.

    verbose: int
        The verbosity level, used with joblib for multiprocessing.
        The frequency of the messages increases with the verbosity level.
        If it is more than `10`, all iterations are reported.
        Above `50`, the output is sent to stdout.

        By default `0`.

    conformity_score: Optional[BaseRegressionScore]
        BaseRegressionScore instance.
        It defines the link between the observed values, the predicted ones
        and the conformity scores. For instance, the default `None` value
        corresponds to a conformity score which assumes
        y_obs = y_pred + conformity_score.

        - `None`, to use the default `AbsoluteConformityScore` conformity
          score
        - BaseRegressionScore: any `BaseRegressionScore` class

        By default `None`.

    random_state: Optional[Union[int, RandomState]]
        Pseudo random number generator state used for random sampling.
        Pass an int for reproducible output across multiple function calls.

        By default `None`.

    Attributes
    ----------
    valid_methods_: List[str]
        List of all valid methods.

    estimator_: EnsembleRegressor
        Sklearn estimator that handles all that is related to the estimator.

    conformity_score_function_: BaseRegressionScore
        Score function that handles all that is related to conformity scores.

    conformity_scores_: ArrayLike of shape (n_samples_train,)
        Conformity scores between `y_train` and `y_pred`.

    n_features_in_: int
        Number of features passed to the `fit` method.

    References
    ----------
    Rina Foygel Barber, Emmanuel J. Candès,
    Aaditya Ramdas, and Ryan J. Tibshirani.
    "Predictive inference with the jackknife+."
    Ann. Statist., 49(1):486-507, February 2021.

    Byol Kim, Chen Xu, and Rina Foygel Barber.
    "Predictive Inference Is Free with the Jackknife+-after-Bootstrap."
    34th Conference on Neural Information Processing Systems (NeurIPS 2020).

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.regression.regression import _MapieRegressor
    >>> from sklearn.linear_model import LinearRegression
    >>> X_toy = np.array([[0], [1], [2], [3], [4], [5]])
    >>> y_toy = np.array([5, 7.5, 9.5, 10.5, 12.5, 15])
    >>> clf = LinearRegression().fit(X_toy, y_toy)
    >>> mapie_reg = _MapieRegressor(estimator=clf, cv="prefit")
    >>> mapie_reg = mapie_reg.fit(X_toy, y_toy)
    >>> y_pred, y_pis = mapie_reg.predict(X_toy, alpha=0.5)
    >>> print(y_pis[:, :, 0])
    [[ 4.95714286  5.61428571]
     [ 6.84285714  7.5       ]
     [ 8.72857143  9.38571429]
     [10.61428571 11.27142857]
     [12.5        13.15714286]
     [14.38571429 15.04285714]]
    >>> print(y_pred)
    [ 5.28571429  7.17142857  9.05714286 10.94285714 12.82857143 14.71428571]
    """

    # Names of cv classes for which `agg_function=None` is rejected.
    cv_need_agg_function_ = ["Subsample"]
    no_agg_cv_ = ["prefit", "split"]
    valid_methods_ = ["naive", "base", "plus", "minmax"]
    no_agg_methods_ = ["naive", "base"]
    valid_agg_functions_ = [None, "median", "mean"]
    ensemble_agg_functions_ = ["median", "mean"]
    # Passed to `check_regression_conformity_score` in `_check_fit_parameters`.
    default_sym_ = True
    fit_attributes = [
        "estimator_",
        "conformity_scores_",
        "conformity_score_function_",
        "n_features_in_",
    ]

    def __init__(
        self,
        estimator: Optional[RegressorMixin] = None,
        method: str = "plus",
        cv: Optional[Union[int, str, BaseCrossValidator]] = None,
        test_size: Optional[Union[int, float]] = None,
        n_jobs: Optional[int] = None,
        agg_function: Optional[str] = "mean",
        verbose: int = 0,
        conformity_score: Optional[BaseRegressionScore] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
    ) -> None:
        # Parameters are stored as given; validation happens at fit time
        # (see `_check_fit_parameters`).
        self.estimator = estimator
        self.method = method
        self.cv = cv
        self.test_size = test_size
        self.n_jobs = n_jobs
        self.agg_function = agg_function
        self.verbose = verbose
        self.conformity_score = conformity_score
        self.random_state = random_state
        self._is_fitted = False

    @property
    def is_fitted(self) -> bool:
        """Returns True if the estimator is fitted"""
        return self._is_fitted

    def _check_parameters(self) -> None:
        """
        Perform several checks on input parameters.

        Raises
        ------
        ValueError
            If parameters are not valid.
        """
        self._check_method(self.method)
        _check_n_jobs(self.n_jobs)
        _check_verbose(self.verbose)
        # Called for validation only; the returned generator is not kept.
        check_random_state(self.random_state)

    def _check_method(self, method: str) -> str:
        """
        Check if `method` is correct.

        Parameters
        ----------
        method: str
            Method's name to check.

        Returns
        -------
        str
            `method` itself.

        Raises
        ------
        ValueError
            If `method` is not in `self.valid_methods_`.
        """
        if method not in self.valid_methods_:
            raise ValueError(
                f"Invalid method. Allowed values are {self.valid_methods_}."
            )
        return method

    def _check_agg_function(self, agg_function: Optional[str] = None) -> Optional[str]:
        """
        Check if `agg_function` is correct, and consistent with other
        arguments.

        Parameters
        ----------
        agg_function: Optional[str]
            Aggregation function's name to check, by default `None`.

        Returns
        -------
        str
            `agg_function` itself or `"mean"`.

        Raises
        ------
        ValueError
            If `agg_function` is not in [`None`, `"mean"`, `"median"`],
            or is `None` while cv class is in `cv_need_agg_function_`.
        """
        if agg_function not in self.valid_agg_functions_:
            raise ValueError(
                "Invalid aggregation function. "
                f"Allowed values are '{self.valid_agg_functions_}'."
            )
        if agg_function is None:
            # The cv strategy is matched by class name, not by isinstance.
            if type(self.cv).__name__ in self.cv_need_agg_function_:
                raise ValueError("You need to specify an aggregation function.")
            return "mean"
        return agg_function

    def _check_estimator(
        self, estimator: Optional[RegressorMixin] = None
    ) -> RegressorMixin:
        """
        Check if estimator is `None`,
        and returns a `LinearRegression` instance if necessary.
        If the `cv` attribute is `"prefit"`,
        check if estimator is indeed already fitted.

        Parameters
        ----------
        estimator: Optional[RegressorMixin]
            Estimator to check, by default `None`.

        Returns
        -------
        RegressorMixin
            The estimator itself or a default `LinearRegression` instance.

        Raises
        ------
        ValueError
            If the estimator is not `None`
            and has no `fit` nor `predict` methods.

        NotFittedError
            If the estimator is not fitted
            and `cv` attribute is `"prefit"`.
        """
        if estimator is None:
            return LinearRegression()

        _check_estimator_fit_predict(estimator)
        if self.cv == "prefit":
            if isinstance(estimator, Pipeline):
                # For a pipeline only its last step is checked.
                check_sklearn_user_model_is_fitted(estimator[-1])
            else:
                check_sklearn_user_model_is_fitted(estimator)
        return estimator

    def _check_ensemble(
        self,
        ensemble: bool,
    ) -> None:
        """
        Check that `ensemble` is compatible with `self.agg_function`:
        ensembling requires an aggregation function.

        Parameters
        ----------
        ensemble: bool
            `ensemble` argument to check the coherence with
            `self.agg_function`.

        Raises
        ------
        ValueError
            If `ensemble` is `True` and `self.agg_function` is `None`.
        """
        if ensemble and (self.agg_function is None):
            raise ValueError(
                f"The aggregation function has to be in {self.ensemble_agg_functions_}."
            )

    def _check_fit_parameters(
        self,
        X: ArrayLike,
        y: ArrayLike,
        groups: Optional[ArrayLike] = None,
    ):
        """
        Validate the constructor parameters and the fit inputs.

        Side effects: `self.method` is overwritten with `"base"` when `cv`
        is `"split"` or `"prefit"`, `self.n_features_in_` is set, and the
        `"sample_weight"` entry of `self._fit_params` is replaced by the
        value returned by `_check_null_weight` (or removed when that value
        is `None`). `self._fit_params` must exist before calling this
        method; `init_fit` creates it.

        Parameters
        ----------
        X: ArrayLike
            Observed values.

        y: ArrayLike
            Target values.

        groups: Optional[ArrayLike] of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
            By default `None`.

        Returns
        -------
        tuple
            `(estimator, cs_estimator, agg_function, cv, X, y, groups)`:
            the checked estimator, conformity score, aggregation function
            and cv splitter, followed by the checked data.

        Raises
        ------
        ValueError
            If the conformity score is a `ResidualNormalisedScore` and `cv`
            is neither `"split"` nor `"prefit"`.
        """
        # Checking
        self._check_parameters()
        cv = _check_cv(
            self.cv, test_size=self.test_size, random_state=self.random_state
        )
        # "split" and "prefit" only support the "base" method: any other
        # method is silently replaced rather than rejected.
        if self.cv in ["split", "prefit"] and self.method in [
            "naive",
            "plus",
            "minmax",
        ]:
            self.method = "base"
        estimator = self._check_estimator(self.estimator)
        agg_function = self._check_agg_function(self.agg_function)
        cs_estimator = check_regression_conformity_score(
            self.conformity_score, self.default_sym_
        )
        if isinstance(cs_estimator, ResidualNormalisedScore) and self.cv not in [
            "split",
            "prefit",
        ]:
            raise ValueError(
                "The ResidualNormalisedScore can be used only with "
                "`SplitConformalRegressor`"
            )

        X, y = indexable(X, y)
        y = _check_y(y)

        # Handle sample_weight from fit_params
        sample_weight = self._fit_params.pop("sample_weight", None)
        sample_weight, X, y = _check_null_weight(sample_weight, X, y)
        if sample_weight is not None:
            self._fit_params["sample_weight"] = sample_weight

        self.n_features_in_ = _check_n_features_in(X)

        # Casting (typing only, no runtime conversion)
        cv = cast(BaseCrossValidator, cv)
        estimator = cast(RegressorMixin, estimator)
        cs_estimator = cast(BaseRegressionScore, cs_estimator)
        agg_function = cast(Optional[str], agg_function)
        X = cast(NDArray, X)
        y = cast(NDArray, y)
        groups = cast(Optional[NDArray], groups)

        return (estimator, cs_estimator, agg_function, cv, X, y, groups)

    def fit(
        self,
        X: ArrayLike,
        y: ArrayLike,
        groups: Optional[ArrayLike] = None,
        **kwargs: Any,
    ) -> _MapieRegressor:
        """
        Fit estimator and compute conformity scores used for
        prediction intervals.

        All the types of estimator (single or cross validated ones) are
        encapsulated under EnsembleRegressor.

        Parameters
        ----------
        X: ArrayLike of shape (n_samples, n_features)
            Training data.

        y: ArrayLike of shape (n_samples,)
            Training labels.

        groups: Optional[ArrayLike] of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
            By default `None`.

        kwargs : dict
            Additional fit and predict parameters, read from the
            `fit_params` and `predict_params` keys.
            Sample weights can be passed in
            ``fit_params={"sample_weight": ...}``.

        Returns
        -------
        _MapieRegressor
            The model itself.
        """
        # Each step receives its own copy of `kwargs`, so the `pop` done in
        # one step does not hide a key from the next one.
        X, y, groups = self.init_fit(X, y, groups, **kwargs)
        self.fit_estimator(X, y, groups)
        self.conformalize(X, y, groups, **kwargs)
        self._is_fitted = True
        return self

    def init_fit(
        self,
        X: ArrayLike,
        y: ArrayLike,
        groups: Optional[ArrayLike] = None,
        **kwargs: Any,
    ):
        """
        Validate the inputs and build the underlying `EnsembleRegressor`.

        Sets `self._fit_params` (from the `fit_params` keyword),
        `self.conformity_score_function_` and `self.estimator_`.

        Parameters
        ----------
        X: ArrayLike
            Training data.

        y: ArrayLike
            Training labels.

        groups: Optional[ArrayLike]
            Group labels for the samples. By default `None`.

        kwargs : dict
            Only the `fit_params` key is read here.

        Returns
        -------
        tuple
            `(X, y, groups)` as returned by `_check_fit_parameters`.
        """
        self._fit_params = _prepare_params(kwargs.pop("fit_params", {}))

        # Checks
        (
            estimator,
            self.conformity_score_function_,
            agg_function,
            cv,
            X,
            y,
            groups,
        ) = self._check_fit_parameters(X, y, groups)

        # `self.method` is read after the checks because they may have
        # switched it to "base".
        self.estimator_ = EnsembleRegressor(
            estimator,
            self.method,
            cv,
            agg_function,
            self.n_jobs,
            self.test_size,
            self.verbose,
        )
        return (X, y, groups)

    def fit_estimator(
        self,
        X: ArrayLike,
        y: ArrayLike,
        groups: Optional[ArrayLike] = None,
    ) -> _MapieRegressor:
        """
        Run `fit_single_estimator` of the underlying `EnsembleRegressor`,
        forwarding the stored fit parameters.

        Requires `init_fit` to have been called first, since it reads
        `self.estimator_` and `self._fit_params`.

        Returns
        -------
        _MapieRegressor
            The model itself.
        """
        self.estimator_.fit_single_estimator(X, y, groups=groups, **self._fit_params)
        return self

    def conformalize(
        self,
        X: ArrayLike,
        y: ArrayLike,
        groups: Optional[ArrayLike] = None,
        **kwargs: Any,
    ) -> _MapieRegressor:
        """
        Compute and store the conformity scores.

        Runs `fit_multi_estimators` on the underlying `EnsembleRegressor`,
        predicts on the given data with `predict_calib`, and stores the
        resulting scores in `self.conformity_scores_`.

        Requires `init_fit` to have been called first, since it reads
        `self.estimator_`, `self._fit_params` and
        `self.conformity_score_function_`.

        Parameters
        ----------
        X: ArrayLike
            Data on which the conformity scores are computed.

        y: ArrayLike
            Targets associated with `X`.

        groups: Optional[ArrayLike]
            Group labels for the samples. By default `None`.

        kwargs : dict
            Only the `predict_params` key is read here; a missing key or a
            `None` value both mean "no predict parameters".

        Returns
        -------
        _MapieRegressor
            The model itself.
        """
        # `or {}` makes an explicit `predict_params=None` equivalent to an
        # omitted one instead of failing on `len(None)`.
        predict_params = kwargs.pop("predict_params", None) or {}
        # Only a boolean flag is stored (not the parameters themselves); it
        # is handed to `_check_predict_params` in `predict`.
        self._predict_params = len(predict_params) > 0

        self.estimator_.fit_multi_estimators(X, y, groups=groups, **self._fit_params)

        # Predict on calibration data
        y_pred = self.estimator_.predict_calib(X, y=y, groups=groups, **predict_params)

        # Compute the conformity scores (manage jk-ab case)
        self.conformity_scores_ = self.conformity_score_function_.get_conformity_scores(
            y, y_pred, X=X
        )
        return self

    def predict(
        self,
        X: ArrayLike,
        ensemble: bool = False,
        alpha: Optional[Union[float, Iterable[float]]] = None,
        optimize_beta: bool = False,
        allow_infinite_bounds: bool = False,
        **predict_params,
    ) -> Union[NDArray, Tuple[NDArray, NDArray]]:
        """
        Predict target on new samples with confidence intervals.
        Conformity scores from the training set and predictions
        from the model clones are central to the computation.
        Prediction Intervals for a given `alpha` are deduced from either

        - quantiles of conformity scores (`naive` and `base` methods),
        - quantiles of (predictions +/- conformity scores) (`plus` method),
        - quantiles of (max/min(predictions) +/- conformity scores)
          (`minmax` method).

        Parameters
        ----------
        X: ArrayLike of shape (n_samples, n_features)
            Test data.

        ensemble: bool
            Boolean determining whether the predictions are ensembled or not.
            If `False`, predictions are those of the model trained on the
            whole training set.
            If `True`, predictions from perturbed models are aggregated by
            the aggregation function specified in the `agg_function`
            attribute.

            If `cv` is `"prefit"` or `"split"`, `ensemble` is ignored.

            By default `False`.

        alpha: Optional[Union[float, Iterable[float]]]
            Can be a float, a list of floats, or a `ArrayLike` of floats.
            Between `0` and `1`, represents the uncertainty of the
            confidence interval.
            Lower `alpha` produce larger (more conservative) prediction
            intervals.
            `alpha` is the complement of the target coverage level.

            By default `None`.

        optimize_beta: bool
            Whether to optimize the PIs' width or not.

            By default `False`.

        allow_infinite_bounds: bool
            Allow infinite prediction intervals to be produced.

            By default `False`.

        predict_params : dict
            Additional predict parameters.

        Returns
        -------
        Union[NDArray, Tuple[NDArray, NDArray]]
            - NDArray of shape (n_samples,) if `alpha` is `None`.
            - Tuple[NDArray, NDArray] of shapes (n_samples,) and
              (n_samples, 2, n_alpha) if `alpha` is not `None`.

              - [:, 0, :]: Lower bound of the prediction interval.
              - [:, 1, :]: Upper bound of the prediction interval.

        Raises
        ------
        ValueError
            If `ensemble` is `True` and `self.agg_function` is `None`.
        """
        # Checks
        # `_predict_params` only exists once `conformalize` has run.
        if hasattr(self, "_predict_params"):
            _check_predict_params(self._predict_params, predict_params, self.cv)
        check_is_fitted(self)
        self._check_ensemble(ensemble)
        alpha = cast(Optional[NDArray], _check_alpha(alpha))

        # If alpha is None, predict the target without confidence intervals
        if alpha is None:
            y_pred = self.estimator_.predict(
                X, ensemble, return_multi_pred=False, **predict_params
            )
            return np.array(y_pred)

        # Check alpha and the number of effective calibration samples
        alpha_np = cast(NDArray, alpha)
        if not allow_infinite_bounds:
            n = self.conformity_score_function_.get_effective_calibration_samples(
                self.conformity_scores_
            )
            _check_alpha_and_n_samples(alpha_np, n)

        # Predict the target with confidence intervals
        # NOTE(review): unlike the point-prediction path above,
        # `predict_params` is not forwarded here -- confirm this is intended.
        outputs = self.conformity_score_function_.predict_set(
            X,
            alpha_np,
            estimator=self.estimator_,
            conformity_scores=self.conformity_scores_,
            ensemble=ensemble,
            method=self.method,
            optimize_beta=optimize_beta,
            allow_infinite_bounds=allow_infinite_bounds,
        )
        y_pred, y_pred_low, y_pred_up = outputs
        # Lower and upper bounds are stacked on a new axis 1.
        return np.array(y_pred), np.stack([y_pred_low, y_pred_up], axis=1)