from __future__ import annotations
from typing import Any, Generator, Optional, Tuple, Union, cast
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from numpy.random import RandomState
from sklearn.model_selection import BaseCrossValidator
from sklearn.utils import check_random_state, resample
from sklearn.utils.validation import _num_samples
from numpy.typing import NDArray
from .utils import _check_n_samples
[docs]
class Subsample(BaseCrossValidator):
    """
    Cross-validator that draws repeated resamples of the training indices,
    optionally with replacement (bootstrap). It can be passed as the ``cv``
    argument of `JackknifeAfterBootstrapRegressor`.

    Parameters
    ----------
    n_resamplings : int
        Number of resamplings, i.e. number of (train, test) pairs yielded
        by ``split``. By default `30`.
    n_samples: Union[int, float]
        Number of samples in each resampling. By default `None`,
        the size of the training set. If it is between 0 and 1,
        it becomes the fraction of samples.
    replace: bool
        Whether to replace samples in resamplings or not.
        By default `True`.
    random_state: Optional[Union[int, RandomState]]
        int or RandomState instance. By default `None`.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.subsample import Subsample
    >>> cv = Subsample(n_resamplings=2,random_state=0)
    >>> X = np.array([1,2,3,4,5,6,7,8,9,10])
    >>> for train_index, test_index in cv.split(X):
    ...    print(f"train index is {train_index}, test index is {test_index}")
    train index is [5 0 3 3 7 9 3 5 2 4], test index is [1 6 8]
    train index is [7 6 8 8 1 6 7 7 8 1], test index is [0 2 3 4 5 9]
    """

    def __init__(
        self,
        n_resamplings: int = 30,
        n_samples: Optional[Union[int, float]] = None,
        replace: bool = True,
        random_state: Optional[Union[int, RandomState]] = None,
    ) -> None:
        # Parameters are stored untouched; they are only interpreted
        # when ``split`` is iterated.
        self.n_resamplings = n_resamplings
        self.n_samples = n_samples
        self.replace = replace
        self.random_state = random_state

    def split(
        self, X: NDArray, *args: Any, **kargs: Any
    ) -> Generator[Tuple[NDArray, NDArray], None, None]:
        """
        Yield ``n_resamplings`` pairs of training and test indices.

        Parameters
        ----------
        X : NDArray of shape (n_samples, n_features)
            Training data. Only its number of samples is used here.

        Yields
        ------
        train : NDArray of shape (n_indices_training,)
            The resampled training indices for that split.
        test : NDArray of shape (n_indices_test,)
            The indices that were not drawn in ``train``, sorted and
            without duplicates.
        """
        all_indices = np.arange(_num_samples(X))
        draw_size = _check_n_samples(X, self.n_samples, all_indices)
        # A single generator is shared by every draw so that successive
        # resamplings consume the same random stream.
        rng = check_random_state(self.random_state)
        for _ in range(self.n_resamplings):
            drawn = resample(
                all_indices,
                replace=self.replace,
                n_samples=draw_size,
                random_state=rng,
                stratify=None,
            )
            # Everything that was never drawn forms the test set.
            yield drawn, np.setdiff1d(all_indices, drawn)

    def get_n_splits(self, *args: Any, **kargs: Any) -> int:
        """
        Return the number of splitting iterations in the cross-validator.

        Returns
        -------
        int
            The value of ``n_resamplings``.
        """
        return self.n_resamplings
[docs]
class BlockBootstrap(BaseCrossValidator):  # type: ignore
    """
    Generate a sampling method, that block bootstraps the training set.
    It can replace KFold, LeaveOneOut or SubSample as cv argument in the
    TimeSeriesRegressor class.

    Exactly one of `length` and `n_blocks` must be given; the other one
    is derived from the size of the training set.

    Parameters
    ----------
    n_resamplings : int
        Number of resamplings. By default `30`.
    length: int
        Length of the blocks. By default `None`, in which case it is the
        size of the training set integer-divided by `n_blocks`.
    overlapping: bool
        Whether the blocks can overlap or not. By default `False`.
        When `False`, trailing samples that do not fill a whole block
        are not part of any block.
    n_blocks: int
        Number of blocks drawn (with replacement) in each resampling.
        By default `None`, in which case it is the size of the training
        set integer-divided by `length`, plus one.
    random_state: Optional
        int or RandomState instance.

    Raises
    ------
    ValueError
        Raised when ``split`` is iterated if `length` and `n_blocks` are
        both `None` or both provided.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.subsample import BlockBootstrap
    >>> cv = BlockBootstrap(n_resamplings=2, length=3, random_state=0)
    >>> X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    >>> for train_index, test_index in cv.split(X):
    ...    print(f"train index is {train_index}, test index is {test_index}")
    train index is [0 1 2 3 4 5 0 1 2 3 4 5], test index is [6 7 8 9]
    train index is [3 4 5 6 7 8 0 1 2 6 7 8], test index is [9]
    """

    def __init__(
        self,
        n_resamplings: int = 30,
        length: Optional[int] = None,
        n_blocks: Optional[int] = None,
        overlapping: bool = False,
        random_state: Optional[Union[int, RandomState]] = None,
    ) -> None:
        # Parameters are stored untouched; validation happens in ``split``.
        self.n_resamplings = n_resamplings
        self.length = length
        self.n_blocks = n_blocks
        self.overlapping = overlapping
        self.random_state = random_state

    def split(
        self, X: NDArray, *args: Any, **kargs: Any
    ) -> Generator[Tuple[NDArray, NDArray], None, None]:
        """
        Generate indices to split data into training and test sets.

        Parameters
        ----------
        X : NDArray of shape (n_samples, n_features)
            Training data. Only its number of samples is used here.

        Yields
        ------
        train : NDArray of shape (n_indices_training,)
            The training set indices for that split: the concatenation
            of the drawn blocks, so indices may be repeated.
        test : NDArray of shape (n_indices_test,)
            The testing set indices for that split: every index absent
            from ``train``, sorted in increasing order.

        Raises
        ------
        ValueError
            If `length` and `n_blocks` are both `None` or both provided.
        ValueError
            If `n_blocks` is not positive.
        ValueError
            If `length` is not positive or greater than the train set size.
        """
        if (self.n_blocks is not None) + (self.length is not None) != 1:
            raise ValueError(
                "Exactly one argument between `length` or `n_blocks` has to be not None"
            )

        n = _num_samples(X)
        if self.n_blocks is None:
            length = cast(int, self.length)
        else:
            # Reject non-positive values up front so that a zero does not
            # surface as a ZeroDivisionError in the division below.
            if self.n_blocks <= 0:
                raise ValueError("`n_blocks` has to be a positive integer.")
            length = n // self.n_blocks

        # Validate before `length` is used as a divisor or a window size.
        if (length <= 0) or (length > n):
            raise ValueError(
                "The length of blocks is <= 0 or greater than the length "
                "of training set."
            )
        n_blocks = (
            self.n_blocks if self.n_blocks is not None else (n // length) + 1
        )

        indices = np.arange(n)
        if self.overlapping:
            # Every window of `length` consecutive indices is a block.
            blocks = sliding_window_view(indices, window_shape=length)
        else:
            # Drop the trailing remainder, then cut the rest into
            # consecutive, disjoint blocks of `length` indices.
            n_full_blocks = n // length
            blocks = indices[: n_full_blocks * length].reshape(
                n_full_blocks, length
            )

        random_state = check_random_state(self.random_state)
        for _ in range(self.n_resamplings):
            block_indices = resample(
                range(len(blocks)),
                replace=True,
                n_samples=n_blocks,
                random_state=random_state,
                stratify=None,
            )
            train_index = np.concatenate(
                [blocks[block] for block in block_indices], axis=0
            )
            # Sorted complement of the training indices; the cast keeps
            # the int64 dtype regardless of the platform default integer.
            test_index = np.setdiff1d(indices, train_index).astype(np.int64)
            yield train_index, test_index

    def get_n_splits(self, *args: Any, **kargs: Any) -> int:
        """
        Return the number of splitting iterations in the cross-validator.

        Returns
        -------
        int
            The value of ``n_resamplings``.
        """
        return self.n_resamplings