Note
Go to the end to download the full example code.
The symmetric correction parameter in conformalized quantile regression
An example plot of ConformalizedQuantileRegressor illustrating the impact of the symmetric_correction parameter.
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from mapie.metrics.regression import regression_coverage_score
from mapie.regression import ConformalizedQuantileRegressor
from mapie.utils import train_conformalize_test_split
RANDOM_STATE = 1
We generate a synthetic data.
X, y = make_regression(
n_samples=1000, n_features=1, noise=20, random_state=RANDOM_STATE
)
(X_train, X_conformalize, X_test, y_train, y_conformalize, y_test) = (
train_conformalize_test_split(
X,
y,
train_size=0.6,
conformalize_size=0.2,
test_size=0.2,
random_state=RANDOM_STATE,
)
)
# Define confidence level
confidence_level = 0.8
# Initialize a Gradient Boosting Regressor for quantile regression
gb_reg = GradientBoostingRegressor(
loss="quantile", alpha=0.5, random_state=RANDOM_STATE
)
# Using ConformalizedQuantileRegressor
mapie_qr = ConformalizedQuantileRegressor(
estimator=gb_reg, confidence_level=confidence_level
)
mapie_qr.fit(X_train, y_train)
mapie_qr.conformalize(X_conformalize, y_conformalize)
y_pred_sym, y_pis_sym = mapie_qr.predict_interval(X_test, symmetric_correction=True)
y_pred_asym, y_pis_asym = mapie_qr.predict_interval(X_test, symmetric_correction=False)
y_qlow = mapie_qr._mapie_quantile_regressor.estimators_[0].predict(X_test)
y_qup = mapie_qr._mapie_quantile_regressor.estimators_[1].predict(X_test)
print(f"y.shape: {y.shape}")
print(f"y_pis_sym[:, 0].shape: {y_pis_sym[:, 0].shape}")
print(f"y_pis_sym[:, 1].shape: {y_pis_sym[:, 1].shape}")
# Calculate coverage scores
coverage_score_sym = regression_coverage_score(y_test, y_pis_sym)[0]
coverage_score_asym = regression_coverage_score(y_test, y_pis_asym)[0]
# Sort the values for plotting
order = np.argsort(X_test[:, 0])
X_test_sorted = X_test[order]
y_pred_sym_sorted = y_pred_sym[order]
y_pis_sym_sorted = y_pis_sym[order]
y_pred_asym_sorted = y_pred_asym[order]
y_pis_asym_sorted = y_pis_asym[order]
y_qlow = y_qlow[order]
y_qup = y_qup[order]
y.shape: (1000,)
y_pis_sym[:, 0].shape: (200, 1)
y_pis_sym[:, 1].shape: (200, 1)
We will plot the predictions and prediction intervals for both symmetric and asymmetric intervals. The line represents the predicted values, the dashed lines represent the prediction intervals, and the shaded area represents the symmetric and asymmetric prediction intervals.
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1)
plt.xlabel("x")
plt.ylabel("y")
plt.scatter(X_test, y_test, alpha=0.3)
plt.plot(X_test_sorted, y_qlow, color="C1")
plt.plot(X_test_sorted, y_qup, color="C1")
plt.plot(X_test_sorted, y_pis_sym_sorted[:, 0], color="C1", ls="--")
plt.plot(X_test_sorted, y_pis_sym_sorted[:, 1], color="C1", ls="--")
plt.fill_between(
X_test_sorted.ravel(),
y_pis_sym_sorted[:, 0].ravel(),
y_pis_sym_sorted[:, 1].ravel(),
alpha=0.2,
)
plt.title(
f"Symmetric Intervals\n"
f"Target and effective coverages for "
f"confidence_level={confidence_level:.2f}; coverage={coverage_score_sym:.3f})"
)
# Plot asymmetric prediction intervals
plt.subplot(1, 2, 2)
plt.xlabel("x")
plt.ylabel("y")
plt.scatter(X_test, y_test, alpha=0.3)
plt.plot(X_test_sorted, y_qlow, color="C2")
plt.plot(X_test_sorted, y_qup, color="C2")
plt.plot(X_test_sorted, y_pis_asym_sorted[:, 0], color="C2", ls="--")
plt.plot(X_test_sorted, y_pis_asym_sorted[:, 1], color="C2", ls="--")
plt.fill_between(
X_test_sorted.ravel(),
y_pis_asym_sorted[:, 0].ravel(),
y_pis_asym_sorted[:, 1].ravel(),
alpha=0.2,
)
plt.title(
f"Asymmetric Intervals\n"
f"Target and effective coverages for "
f"confidence_level={confidence_level:.2f}; coverage={coverage_score_sym:.3f})"
)
plt.tight_layout()
plt.show()

The symmetric intervals (symmetric_correction=True) use a combined set of residuals for both bounds, while the asymmetric intervals (symmetric_correction=False) use distinct residuals for each bound, allowing for more flexible and accurate intervals that reflect the heteroscedastic nature of the data. The resulting effective coverages demonstrate the theoretical guarantee of the target coverage level confidence_level.
Total running time of the script: (0 minutes 1.203 seconds)