Source code for mapie.mondrian

from __future__ import annotations

from copy import copy
from typing import Iterable, Optional, Tuple, Union, cast

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import _check_y, check_is_fitted, indexable

from mapie.calibration import MapieCalibrator
from mapie.classification import MapieClassifier
from mapie.conformity_scores import (
    AbsoluteConformityScore,
    APSConformityScore,
    GammaConformityScore,
    LACConformityScore,
    NaiveConformityScore,
    TopKConformityScore
)
from mapie.multi_label_classification import MapieMultiLabelClassifier
from mapie.regression import (
    MapieQuantileRegressor,
    MapieRegressor,
    MapieTimeSeriesRegressor
)
from mapie.utils import check_alpha
from mapie._typing import ArrayLike, NDArray


class MondrianCP(BaseEstimator):
    """
    Mondrian is a method for making conformal predictions
    for partition of individuals.

    The Mondrian method is implemented in the `MondrianCP` class. It takes as
    input a `MapieClassifier` or `MapieRegressor` estimator and fits a copy
    of it for each group of individuals. The `MondrianCP` class can then be
    used to run a conformal prediction procedure for each of these groups and
    hence achieve marginal coverage on each of them.

    The underlying estimator must be used with `cv='prefit'` and the
    conformity score must be one of the following:

    - For `MapieClassifier`: 'lac', 'score', 'cumulated_score', 'aps' or
      'top_k'
    - For `MapieRegressor`: 'absolute' or 'gamma'

    Parameters
    ----------
    mapie_estimator : Union[MapieClassifier, MapieRegressor]
        The estimator for which the Mondrian method will be applied.
        It must be used with `cv='prefit'` and the
        conformity score must be one of the following:

        - For `MapieClassifier`: 'lac', 'score', 'cumulated_score', 'aps' or
          'top_k'
        - For `MapieRegressor`: 'absolute' or 'gamma'

    Attributes
    ----------
    partition_groups : NDArray
        The unique groups of individuals for which the estimator was fitted

    mapie_estimators : Dict
        A dictionary containing the fitted conformal estimator for each group
        of individuals

    n_classes : int
        The number of distinct target values seen during `fit`. Only set
        when the underlying estimator is a `MapieClassifier`.

    References
    ----------
    Vladimir Vovk, David Lindsay, Ilia Nouretdinov, and Alex Gammerman.
    Mondrian confidence machine.
    Technical report, Royal Holloway University of London, 2003

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mapie.classification import MapieClassifier
    >>> from mapie.mondrian import MondrianCP
    >>> X_toy = np.arange(9).reshape(-1, 1)
    >>> y_toy = np.stack([0, 0, 1, 0, 1, 2, 1, 2, 2])
    >>> partition_toy = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    >>> clf = LogisticRegression(random_state=42).fit(X_toy, y_toy)
    >>> mapie = MondrianCP(MapieClassifier(estimator=clf, cv="prefit")).fit(
    ...     X_toy, y_toy, partition=partition_toy
    ... )
    >>> _, y_pi_mapie = mapie.predict(
    ...     X_toy, partition=partition_toy, alpha=0.4)
    >>> print(y_pi_mapie[:, :, 0].astype(bool))
    [[ True False False]
     [ True False False]
     [ True  True False]
     [ True  True False]
     [False  True False]
     [False  True  True]
     [False False  True]
     [False False  True]
     [False False  True]]
    """

    # Mapie estimators that are explicitly rejected by `_check_estimator`.
    not_allowed_estimators = (
        MapieCalibrator,
        MapieMultiLabelClassifier,
        MapieQuantileRegressor,
        MapieTimeSeriesRegressor
    )
    # Accepted values of `MapieClassifier.method`.
    allowed_classification_ncs_str = [
        "lac", "score", "cumulated_score", "aps", "top_k"
    ]
    # Accepted (exact) types of `MapieClassifier.conformity_score`.
    allowed_classification_ncs_class = (
        LACConformityScore, NaiveConformityScore, APSConformityScore,
        TopKConformityScore
    )
    # Accepted types of `MapieRegressor.conformity_score`.
    allowed_regression_ncs = (
        AbsoluteConformityScore, GammaConformityScore
    )
    # Attributes that `predict` requires to exist (checked through
    # `check_is_fitted`).
    fit_attributes = [
        "partition_groups",
        "mapie_estimators"
    ]

    def __init__(
        self,
        mapie_estimator: Union[MapieClassifier, MapieRegressor]
    ):
        self.mapie_estimator = mapie_estimator

    def fit(
        self, X: ArrayLike, y: ArrayLike, partition: ArrayLike,
        **fit_params
    ) -> MondrianCP:
        """
        Fit the Mondrian method

        Parameters
        ----------
        X : ArrayLike of shape (n_samples, n_features)
            The input data

        y : ArrayLike of shape (n_samples,) or (n_samples, n_outputs)
            The target values

        partition : ArrayLike of shape (n_samples,)
            The groups of individuals. Must be defined by integers. There must
            be at least 2 individuals per group.

        **fit_params
            Additional keyword arguments to pass to the estimator's fit method
            that may be specific to the Mapie estimator used

        Returns
        -------
        MondrianCP
            The fitted instance (`self`).

        Raises
        ------
        ValueError
            If the estimator, its `cv`, its conformity score or the partition
            do not pass the checks of `_check_fit_parameters`.
        """
        X, y, partition = self._check_fit_parameters(X, y, partition)
        self.partition_groups = np.unique(partition)
        self.mapie_estimators = {}

        if isinstance(self.mapie_estimator, MapieClassifier):
            # Needed by `predict` to size the prediction-set array.
            self.n_classes = len(np.unique(y))

        for group in self.partition_groups:
            # Shallow copy: each group gets its own Mapie wrapper object.
            mapie_group_estimator = copy(self.mapie_estimator)
            indices_groups = np.argwhere(partition == group)[:, 0]
            X_g = [X[index] for index in indices_groups]
            y_g = [y[index] for index in indices_groups]
            mapie_group_estimator.fit(X_g, y_g, **fit_params)
            self.mapie_estimators[group] = mapie_group_estimator
        return self

    def predict(
        self, X: ArrayLike, partition: ArrayLike,
        alpha: Optional[Union[float, Iterable[float]]] = None,
        **predict_params
    ) -> Union[NDArray, Tuple[NDArray, NDArray]]:
        """
        Perform conformal prediction for each group of individuals

        Parameters
        ----------
        X : ArrayLike of shape (n_samples, n_features)
            The input data

        partition : ArrayLike of shape (n_samples,)
            The groups of individuals. Must be defined by integers and must
            only contain groups that were seen during `fit`. It is not used
            (nor validated) when `alpha` is None.

        alpha : float or Iterable[float], optional
            The desired coverage level(s) for each group.

            By default None.

        **predict_params
            Additional keyword arguments to pass to the estimator's predict
            method that may be specific to the Mapie estimator used

        Returns
        -------
        y_pred : NDArray of shape (n_samples,) or (n_samples, n_outputs)
            The predicted values. When `alpha` is None (and the underlying
            Mapie estimator holds an estimator), this is the only value
            returned and it comes directly from that estimator's `predict`.

        y_pss : NDArray of shape (n_samples, n_outputs, n_alpha)
            The predicted sets for the desired levels of coverage. Only
            returned when `alpha` is not None.

        Raises
        ------
        ValueError
            If `partition` contains a group that was not seen during `fit`
            or if its length differs from the number of rows in `X`.
        """
        check_is_fitted(self, self.fit_attributes)
        X = cast(NDArray, X)
        alpha_np = cast(NDArray, check_alpha(alpha))

        # Without alpha there is no conformal step: delegate to the
        # underlying (already fitted) estimator.
        if alpha_np is None and self.mapie_estimator.estimator is not None:
            return self.mapie_estimator.estimator.predict(
                X, **predict_params
            )

        n_samples = len(X)
        if isinstance(self.mapie_estimator, MapieClassifier):
            n_outputs = self.n_classes
        else:
            # presumably the lower and upper bounds of the interval
            n_outputs = 2
        # `np.empty` defaults to a float dtype; every row is overwritten
        # below because each sample belongs to exactly one group.
        y_pred = np.empty((n_samples,))
        y_pss = np.empty((n_samples, n_outputs, len(alpha_np)))

        partition = self._check_partition_predict(X, partition)
        for group in np.unique(partition):
            indices_groups = np.argwhere(partition == group)[:, 0]
            X_g = [X[index] for index in indices_groups]
            y_pred_g, y_pss_g = self.mapie_estimators[group].predict(
                X_g, alpha=alpha_np, **predict_params
            )
            y_pred[indices_groups] = y_pred_g
            y_pss[indices_groups] = y_pss_g

        return y_pred, y_pss

    def _check_cv(self) -> None:
        """
        Check that the underlying Mapie estimator uses cv='prefit'

        Raises
        ------
        ValueError
            If the underlying Mapie estimator does not use cv='prefit'
        """
        if self.mapie_estimator.cv != "prefit":
            raise ValueError(
                "Mondrian can only be used if the underlying Mapie "
                "estimator uses cv='prefit'."
            )

    def _check_partition_fit(self, X: NDArray, partition: NDArray) -> None:
        """
        Check that each group is defined by an integer and check that there
        are at least 2 individuals per group

        Parameters
        ----------
        X : NDArray of shape (n_samples, n_features)
            The input data

        partition : NDArray of shape (n_samples,)
            The groups of individuals.

        Raises
        ------
        ValueError
            If the partition is not defined by integers
            If there is less than 2 individuals per group
            If the number of individuals in the partition is not equal to the
            number of rows in X
        """
        if not np.issubdtype(partition.dtype, np.integer):
            raise ValueError("The partition must be defined by integers")

        _, counts = np.unique(partition, return_counts=True)
        if np.min(counts) < 2:
            raise ValueError("There must be at least 2 individuals per group")

        self._check_partition_length(X, partition)

    def _check_partition_predict(
        self, X: NDArray, partition: ArrayLike
    ) -> NDArray:
        """
        Check that there is no new group in the prediction and that
        the number of individuals in the partition is equal to the number of
        rows in X

        Parameters
        ----------
        X : NDArray of shape (n_samples, n_features)
            The input data

        partition : ArrayLike of shape (n_samples,)
            The groups of individuals. Must be defined by integers

        Returns
        -------
        partition : NDArray of shape (n_samples,)
            Partition of the dataset, converted to a numpy array

        Raises
        ------
        ValueError
            If there is a new group in the prediction
            If the number of individuals in the partition is not equal to the
            number of rows in X
        """
        partition = cast(NDArray, np.array(partition))
        if not np.all(np.isin(partition, self.partition_groups)):
            raise ValueError(
                "There is at least one new group in the prediction."
            )
        self._check_partition_length(X, partition)
        return partition

    def _check_partition_length(
        self, X: NDArray, partition: NDArray
    ) -> None:
        """
        Check that the number of rows in the groups array is equal to the
        number of rows in the attributes array.

        Parameters
        ----------
        X : NDArray of shape (n_samples, n_features)
            The individual data.

        partition : NDArray of shape (n_samples,)
            The groups of individuals. Must be defined by integers

        Raises
        ------
        ValueError
            If the number of individuals in the partition is not equal to the
            number of rows in X
        """
        if len(partition) != len(X):
            raise ValueError(
                "The number of individuals in the partition must "
                "be equal to the number of rows in X"
            )

    def _check_estimator(self) -> None:
        """
        Check that the estimator is not in the `not_allowed_estimators`.

        Raises
        ------
        ValueError
            If the estimator is in the `not_allowed_estimators`.
        """
        if isinstance(self.mapie_estimator, self.not_allowed_estimators):
            raise ValueError(
                "The estimator must be a MapieClassifier or MapieRegressor"
            )

    def _check_confomity_score(self) -> None:
        """
        Check that the conformity score is in `allowed_classification_ncs_str`
        or `allowed_classification_ncs_class` if the estimator is a
        `MapieClassifier` or in the `allowed_regression_ncs` if the estimator
        is a `MapieRegressor`

        A `method` or `conformity_score` left to None is accepted.

        Raises
        ------
        ValueError
            If conformity score is not in the `allowed_classification_ncs_str`
            or `allowed_classification_ncs_class` if the estimator is a
            `MapieClassifier` or in the `allowed_regression_ncs` if the
            estimator is a `MapieRegressor`.
        """
        if isinstance(self.mapie_estimator, MapieClassifier):
            method = self.mapie_estimator.method
            if (
                method is not None
                and method not in self.allowed_classification_ncs_str
            ):
                raise ValueError(
                    "The conformity score for the MapieClassifier must "
                    + f"be one of {self.allowed_classification_ncs_str}"
                )
            score = self.mapie_estimator.conformity_score
            # Exact type match (not `isinstance`): subclasses of the allowed
            # score classes are rejected.
            # NOTE(review): presumably deliberate - confirm before relaxing.
            if (
                score is not None
                and type(score) not in self.allowed_classification_ncs_class
            ):
                raise ValueError(
                    "The conformity score for the MapieClassifier must"
                    + f" be one of {self.allowed_classification_ncs_class}"
                )
        else:
            score = self.mapie_estimator.conformity_score
            if (
                score is not None
                and not isinstance(score, self.allowed_regression_ncs)
            ):
                raise ValueError(
                    "The conformity score for the MapieRegressor must "
                    + f"be one of {self.allowed_regression_ncs}"
                )

    def _check_fit_parameters(
        self, X: ArrayLike, y: ArrayLike, partition: ArrayLike
    ) -> Tuple[NDArray, NDArray, NDArray]:
        """
        Perform checks on the input data, partition and the estimator

        Parameters
        ----------
        X : ArrayLike of shape (n_samples, n_features)
            The input data

        y : ArrayLike of shape (n_samples,) or (n_samples, n_outputs)
            The target values

        partition : ArrayLike of shape (n_samples,)
            The groups of individuals. Must be defined by integers

        Returns
        -------
        X : NDArray of shape (n_samples, n_features)
            The input data

        y : NDArray of shape (n_samples,) or (n_samples, n_outputs)
            The target values

        partition : NDArray of shape (n_samples,)
            The group values

        Raises
        ------
        ValueError
            If the estimator type, its `cv`, its conformity score or the
            partition is not valid.
        """
        self._check_estimator()
        self._check_cv()
        self._check_confomity_score()
        X, y = indexable(X, y)
        y = _check_y(y)
        X = cast(NDArray, X)
        y = cast(NDArray, y)
        partition = cast(NDArray, np.array(partition))
        # `_check_partition_fit` also verifies that the partition has as
        # many entries as X has rows, so no separate length check is needed.
        self._check_partition_fit(X, partition)

        return X, y, partition