From 051f8a406b7336cba19695063bcb6221b742e04b Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:03:26 +0000 Subject: [PATCH 01/13] UPD: allow more split cv methods for regression --- mapie/estimator/estimator.py | 17 ++++--- mapie/regression/regression.py | 9 +++- mapie/regression/time_series_regression.py | 8 ++-- mapie/tests/test_quantile_regression.py | 32 +++++++++++++ mapie/tests/test_regression.py | 8 ++-- mapie/tests/test_utils.py | 56 ++++++++-------------- mapie/utils.py | 31 ++++++++++++ 7 files changed, 110 insertions(+), 51 deletions(-) diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py index 33bda7b3d..df3872b7d 100644 --- a/mapie/estimator/estimator.py +++ b/mapie/estimator/estimator.py @@ -5,13 +5,13 @@ import numpy as np from joblib import Parallel, delayed from sklearn.base import RegressorMixin, clone -from sklearn.model_selection import BaseCrossValidator, ShuffleSplit +from sklearn.model_selection import BaseCrossValidator from sklearn.utils import _safe_indexing from sklearn.utils.validation import (_num_samples, check_is_fitted) from mapie._typing import ArrayLike, NDArray from mapie.aggregation_functions import aggregate_all, phi2D -from mapie.utils import (check_nan_in_aposteriori_prediction, +from mapie.utils import (check_nan_in_aposteriori_prediction, check_no_agg_cv, fit_estimator) from mapie.estimator.interface import EnsembleEstimator @@ -278,7 +278,9 @@ def _aggregate_with_mask( ArrayLike of shape (n_samples_test,) Array of aggregated predictions for each testing sample. """ - if self.method in self.no_agg_methods_ or self.cv in self.no_agg_cv_: + if self.method in self.no_agg_methods_ or ( + check_no_agg_cv(self.cv, self.no_agg_cv_) + ): raise ValueError( "There should not be aggregation of predictions " f"if cv is in '{self.no_agg_cv_}' " @@ -434,8 +436,9 @@ def fit( ) for train_index, _ in cv.split(X) ) - if isinstance(cv, ShuffleSplit): - single_estimator_ = estimators_[0] + # In split-CP, we keep only the model fitted on train dataset + if check_no_agg_cv(cv, self.no_agg_cv_): + single_estimator_ = estimators_[0] self.single_estimator_ = single_estimator_ self.estimators_ = estimators_ @@ -487,7 +490,9 @@ def predict( if not return_multi_pred and not ensemble: return y_pred - if self.method in self.no_agg_methods_ or self.cv in self.no_agg_cv_: + if self.method in self.no_agg_methods_ or ( + check_no_agg_cv(self.cv, self.no_agg_cv_) + ): y_pred_multi_low = y_pred[:, np.newaxis] y_pred_multi_up = y_pred[:, np.newaxis] else: diff --git a/mapie/regression/regression.py b/mapie/regression/regression.py index 356193155..aa8e294f8 100644 --- a/mapie/regression/regression.py +++ b/mapie/regression/regression.py @@ -18,7 +18,8 @@ from mapie.utils import (check_alpha, check_alpha_and_n_samples, check_conformity_score, check_cv, check_estimator_fit_predict, check_n_features_in, - check_n_jobs, check_null_weight, check_verbose) + check_n_jobs, check_no_agg_cv, check_null_weight, + check_verbose) class MapieRegressor(BaseEstimator, RegressorMixin): @@ -315,7 +316,9 @@ def _check_agg_function( "You need to specify an aggregation function when " f"cv's type is in {self.cv_need_agg_function_}." ) - elif (agg_function is not None) or (self.cv in self.no_agg_cv_): + elif (agg_function is not None) or ( + check_no_agg_cv(self.cv, self.no_agg_cv_) + ): return agg_function else: return "mean" @@ -507,6 +510,8 @@ def fit( ) # Fit the prediction function self.estimator_ = self.estimator_.fit(X, y, sample_weight) + + # Predict on calibration data y_pred = self.estimator_.predict_calib(X) # Compute the conformity scores (manage jk-ab case) diff --git a/mapie/regression/time_series_regression.py b/mapie/regression/time_series_regression.py index 9e7354b98..03056fd51 100644 --- a/mapie/regression/time_series_regression.py +++ b/mapie/regression/time_series_regression.py @@ -11,7 +11,8 @@ from mapie._typing import ArrayLike, NDArray from mapie.aggregation_functions import aggregate_all from .regression import MapieRegressor -from mapie.utils import check_alpha, check_alpha_and_n_samples +from mapie.utils import (check_alpha, check_alpha_and_n_samples, + check_no_agg_cv) class MapieTimeSeriesRegressor(MapieRegressor): @@ -316,8 +317,9 @@ def predict( self.lower_quantiles_ = lower_quantiles self.higher_quantiles_ = higher_quantiles - if self.method in self.no_agg_methods_ \ - or self.cv in self.no_agg_cv_: + if self.method in self.no_agg_methods_ or ( + check_no_agg_cv(self.cv, self.no_agg_cv_) + ): y_pred_low = y_pred[:, np.newaxis] + lower_quantiles y_pred_up = y_pred[:, np.newaxis] + higher_quantiles else: diff --git a/mapie/tests/test_quantile_regression.py b/mapie/tests/test_quantile_regression.py index e37a176aa..58a50b9de 100644 --- a/mapie/tests/test_quantile_regression.py +++ b/mapie/tests/test_quantile_regression.py @@ -347,6 +347,38 @@ def test_results_for_same_alpha( np.testing.assert_allclose(y_pis[:, 1, 0], y_pis_clone[:, 1, 0]) +def test_ensemble_in_predict() -> None: + """Checking for ensemble defined in predict of CQR""" + mapie_reg = MapieQuantileRegressor() + mapie_reg.fit(X, y) + with pytest.warns( + UserWarning, match=r"WARNING: Alpha should not be spec.*" + ): + mapie_reg.predict(X, alpha=0.2) + + +def test_alpha_in_predict() -> None: + """Checking for alpha defined in predict of CQR""" + mapie_reg = MapieQuantileRegressor() + mapie_reg.fit(X, y) + with pytest.warns(UserWarning, match=r"WARNING: ensemble is not util*"): + mapie_reg.predict(X, ensemble=True) + + +@pytest.mark.parametrize("estimator", [-1, 3, 0.2]) +def test_quantile_prefit_non_iterable(estimator: Any) -> None: + """ + Test that there is a list of estimators provided when cv='prefit' + is called for MapieQuantileRegressor. + """ + with pytest.raises( + ValueError, + match=r".*Estimator for prefit must be an iterable object.*", + ): + mapie_reg = MapieQuantileRegressor(estimator=estimator, cv="prefit") + mapie_reg.fit([1, 2, 3], [4, 5, 6]) + + @pytest.mark.parametrize("alphas", ["hello", MapieQuantileRegressor, [2], 1]) def test_wrong_alphas_types(alphas: float) -> None: """Checking for wrong type of alphas""" diff --git a/mapie/tests/test_regression.py b/mapie/tests/test_regression.py index 14557982f..ec235eee4 100644 --- a/mapie/tests/test_regression.py +++ b/mapie/tests/test_regression.py @@ -11,8 +11,8 @@ from sklearn.dummy import DummyRegressor from sklearn.impute import SimpleImputer from sklearn.linear_model import LinearRegression -from sklearn.model_selection import (KFold, LeaveOneOut, ShuffleSplit, - train_test_split) +from sklearn.model_selection import (KFold, LeaveOneOut, PredefinedSplit, + ShuffleSplit, train_test_split) from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.utils.validation import check_is_fitted @@ -211,7 +211,9 @@ def test_valid_agg_function(agg_function: str) -> None: @pytest.mark.parametrize( "cv", [None, -1, 2, KFold(), LeaveOneOut(), - ShuffleSplit(n_splits=1), "prefit", "split"] + ShuffleSplit(n_splits=1), + PredefinedSplit(test_fold=[-1]*3+[0]*3), + "prefit", "split"] ) def test_valid_cv(cv: Any) -> None: """Test that valid cv raise no errors.""" diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index 048a50f5c..69611e9da 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -7,18 +7,18 @@ from numpy.random import RandomState from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression -from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection import BaseCrossValidator, KFold, ShuffleSplit from sklearn.utils.validation import check_is_fitted from mapie._typing import ArrayLike, NDArray -from mapie.regression import MapieQuantileRegressor from mapie.utils import (check_alpha, check_alpha_and_n_samples, check_array_nan, check_array_inf, check_arrays_length, check_binary_zero_one, check_cv, check_lower_upper_bounds, check_n_features_in, - check_n_jobs, check_null_weight, check_number_bins, - check_split_strategy, check_verbose, - compute_quantiles, fit_estimator, get_binning_groups) + check_n_jobs, check_no_agg_cv, check_null_weight, + check_number_bins, check_split_strategy, + check_verbose, compute_quantiles, fit_estimator, + get_binning_groups) X_toy = np.array([0, 1, 2, 3, 4, 5]).reshape(-1, 1) y_toy = np.array([5, 7, 9, 11, 13, 15]) @@ -254,24 +254,6 @@ def test_final1D_low_high_pred() -> None: check_lower_upper_bounds(y_preds, y_pred_low, y_pred_up) -def test_ensemble_in_predict() -> None: - """Checking for ensemble defined in predict of CQR""" - mapie_reg = MapieQuantileRegressor() - mapie_reg.fit(X, y) - with pytest.warns( - UserWarning, match=r"WARNING: Alpha should not be spec.*" - ): - mapie_reg.predict(X, alpha=0.2) - - -def test_alpha_in_predict() -> None: - """Checking for alpha defined in predict of CQR""" - mapie_reg = MapieQuantileRegressor() - mapie_reg.fit(X, y) - with pytest.warns(UserWarning, match=r"WARNING: ensemble is not util*"): - mapie_reg.predict(X, ensemble=True) - - def test_compute_quantiles_value_error(): """Test that if the size of the last axis of vector is different from the number of aphas an error is raised. @@ -325,20 +307,6 @@ def test_compute_quantiles_2D_and_3D(alphas: NDArray): assert (quantiles1 == quantiles2).all() -@pytest.mark.parametrize("estimator", [-1, 3, 0.2]) -def test_quantile_prefit_non_iterable(estimator: Any) -> None: - """ - Test that there is a list of estimators provided when cv='prefit' - is called for MapieQuantileRegressor. - """ - with pytest.raises( - ValueError, - match=r".*Estimator for prefit must be an iterable object.*", - ): - mapie_reg = MapieQuantileRegressor(estimator=estimator, cv="prefit") - mapie_reg.fit([1, 2, 3], [4, 5, 6]) - - # def test_calib_set_no_Xy_but_sample_weight() -> None: # """Test warning message if sample weight provided but no X y in calib.""" # X = np.array([4, 5, 6]) @@ -474,3 +442,17 @@ def test_check_cv_same_split_no_random_state(cv: BaseCrossValidator) -> None: for i in range(cv.get_n_splits()): np.testing.assert_allclose(train_indices_1[i], train_indices_2[i]) + + +def test_check_no_agg_cv() -> None: + array = ["prefit", "split"] + np.testing.assert_almost_equal(check_no_agg_cv(1, array), True) + np.testing.assert_almost_equal(check_no_agg_cv(2, array), False) + cv = "split" + np.testing.assert_almost_equal(check_no_agg_cv(cv, array), True) + cv = KFold(5) + np.testing.assert_almost_equal(check_no_agg_cv(cv, array), False) + cv = ShuffleSplit(1) + np.testing.assert_almost_equal(check_no_agg_cv(cv, array), True) + cv = ShuffleSplit(2) + np.testing.assert_almost_equal(check_no_agg_cv(cv, array), False) diff --git a/mapie/utils.py b/mapie/utils.py index 51f9ec78d..bc3919682 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -208,6 +208,37 @@ def check_cv( ) +def check_no_agg_cv( + cv: Union[int, str, BaseCrossValidator, BaseShuffleSplit], + no_agg_cv_array: list, +) -> bool: + """ + Check if cross-validator is ``"prefit"``, ``"split"`` or any split + equivalent `BaseCrossValidator` or `BaseShuffleSplit`. + + Parameters + ---------- + cv: Union[int, str, BaseCrossValidator, BaseShuffleSplit] + Cross-validator to check. + + no_agg_cv_array: list + List of all non-aggregated cv methods. + + Returns + ------- + bool + True if `cv` is a split equivalent / non-aggregated cv method. + """ + if isinstance(cv, str): + return cv in no_agg_cv_array + elif isinstance(cv, int): + return cv == 1 + try: + return cv.get_n_splits() == 1 + except Exception: + return False + + def check_alpha( alpha: Optional[Union[float, Iterable[float]]] = None ) -> Optional[ArrayLike]: From d5b03a520e7bb5458b87f1c95ab813d40e96ffb0 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:20:19 +0000 Subject: [PATCH 02/13] UPD: HISTORY.rst --- HISTORY.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 2d8b62c52..1ab29b21e 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -4,8 +4,9 @@ History ##### (##########) ------------------ -* Add new checks for metrics calculations -* Fix reference for residual normalised score in documentation +* Allow to use more split methods for MapieRegressor (ShuffleSplit, PredefinedSplit). +* Add new checks for metrics calculations. +* Fix reference for residual normalised score in documentation. 0.7.0 (2023-09-14) From 31583ce72950eadb34680641dc49fdfe45967c72 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:37:45 +0000 Subject: [PATCH 03/13] FIX: typos in docstring --- mapie/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mapie/utils.py b/mapie/utils.py index bc3919682..5c4ce15ff 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -129,13 +129,14 @@ def fit_estimator( def check_cv( - cv: Optional[Union[int, str, BaseCrossValidator]] = None, + cv: Optional[Union[int, str, BaseCrossValidator, BaseShuffleSplit]] = None, test_size: Optional[Union[int, float]] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, -) -> Union[str, BaseCrossValidator]: +) -> Union[str, BaseCrossValidator, BaseShuffleSplit]: """ Check if cross-validator is - ``None``, ``int``, ``"prefit"``, ``"split"``or ``BaseCrossValidator``. + ``None``, ``int``, ``"prefit"``, ``"split"``, ``BaseCrossValidator`` or + ``BaseShuffleSplit``. Return a ``LeaveOneOut`` instance if integer equal to -1. Return a ``KFold`` instance if integer superior or equal to 2. Return a ``KFold`` instance if ``None``. @@ -143,7 +144,7 @@ def check_cv( Parameters ---------- - cv: Optional[Union[int, str, BaseCrossValidator]], optional + cv: Optional[Union[int, str, BaseCrossValidator, BaseShuffleSplit]] Cross-validator to check, by default ``None``. test_size: Optional[Union[int, float]] @@ -163,8 +164,8 @@ def check_cv( Returns ------- - Optional[Union[float, str]] - 'prefit' or None. + Union[str, BaseCrossValidator, BaseShuffleSplit] + The cast `cv` parameter. Raises ------ From b01c2dc230f91702db31032d1ef1641e4a2cd019 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 13 Dec 2023 15:04:29 +0000 Subject: [PATCH 04/13] FIX: make lint error --- mapie/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapie/utils.py b/mapie/utils.py index 5c4ce15ff..b3ca81e29 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -135,7 +135,7 @@ def check_cv( ) -> Union[str, BaseCrossValidator, BaseShuffleSplit]: """ Check if cross-validator is - ``None``, ``int``, ``"prefit"``, ``"split"``, ``BaseCrossValidator`` or + ``None``, ``int``, ``"prefit"``, ``"split"``, ``BaseCrossValidator`` or ``BaseShuffleSplit``. Return a ``LeaveOneOut`` instance if integer equal to -1. Return a ``KFold`` instance if integer superior or equal to 2. From 29dfbd8f386f12baff562310c26e94d62c9a7244 Mon Sep 17 00:00:00 2001 From: vincentblot28 Date: Mon, 18 Dec 2023 17:03:28 +0100 Subject: [PATCH 05/13] ENH: move back Quantile regression tests --- mapie/tests/test_quantile_regression.py | 32 ------------------------ mapie/tests/test_utils.py | 33 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/mapie/tests/test_quantile_regression.py b/mapie/tests/test_quantile_regression.py index 58a50b9de..e37a176aa 100644 --- a/mapie/tests/test_quantile_regression.py +++ b/mapie/tests/test_quantile_regression.py @@ -347,38 +347,6 @@ def test_results_for_same_alpha( np.testing.assert_allclose(y_pis[:, 1, 0], y_pis_clone[:, 1, 0]) -def test_ensemble_in_predict() -> None: - """Checking for ensemble defined in predict of CQR""" - mapie_reg = MapieQuantileRegressor() - mapie_reg.fit(X, y) - with pytest.warns( - UserWarning, match=r"WARNING: Alpha should not be spec.*" - ): - mapie_reg.predict(X, alpha=0.2) - - -def test_alpha_in_predict() -> None: - """Checking for alpha defined in predict of CQR""" - mapie_reg = MapieQuantileRegressor() - mapie_reg.fit(X, y) - with pytest.warns(UserWarning, match=r"WARNING: ensemble is not util*"): - mapie_reg.predict(X, ensemble=True) - - -@pytest.mark.parametrize("estimator", [-1, 3, 0.2]) -def test_quantile_prefit_non_iterable(estimator: Any) -> None: - """ - Test that there is a list of estimators provided when cv='prefit' - is called for MapieQuantileRegressor. - """ - with pytest.raises( - ValueError, - match=r".*Estimator for prefit must be an iterable object.*", - ): - mapie_reg = MapieQuantileRegressor(estimator=estimator, cv="prefit") - mapie_reg.fit([1, 2, 3], [4, 5, 6]) - - @pytest.mark.parametrize("alphas", ["hello", MapieQuantileRegressor, [2], 1]) def test_wrong_alphas_types(alphas: float) -> None: """Checking for wrong type of alphas""" diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index 69611e9da..9f8feea20 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -11,6 +11,7 @@ from sklearn.utils.validation import check_is_fitted from mapie._typing import ArrayLike, NDArray +from mapie.regression import MapieQuantileRegressor from mapie.utils import (check_alpha, check_alpha_and_n_samples, check_array_nan, check_array_inf, check_arrays_length, check_binary_zero_one, check_cv, @@ -254,6 +255,24 @@ def test_final1D_low_high_pred() -> None: check_lower_upper_bounds(y_preds, y_pred_low, y_pred_up) +def test_ensemble_in_predict() -> None: + """Checking for ensemble defined in predict of CQR""" + mapie_reg = MapieQuantileRegressor() + mapie_reg.fit(X, y) + with pytest.warns( + UserWarning, match=r"WARNING: Alpha should not be spec.*" + ): + mapie_reg.predict(X, alpha=0.2) + + +def test_alpha_in_predict() -> None: + """Checking for alpha defined in predict of CQR""" + mapie_reg = MapieQuantileRegressor() + mapie_reg.fit(X, y) + with pytest.warns(UserWarning, match=r"WARNING: ensemble is not util*"): + mapie_reg.predict(X, ensemble=True) + + def test_compute_quantiles_value_error(): """Test that if the size of the last axis of vector is different from the number of aphas an error is raised. @@ -307,6 +326,20 @@ def test_compute_quantiles_2D_and_3D(alphas: NDArray): assert (quantiles1 == quantiles2).all() +@pytest.mark.parametrize("estimator", [-1, 3, 0.2]) +def test_quantile_prefit_non_iterable(estimator: Any) -> None: + """ + Test that there is a list of estimators provided when cv='prefit' + is called for MapieQuantileRegressor. + """ + with pytest.raises( + ValueError, + match=r".*Estimator for prefit must be an iterable object.*", + ): + mapie_reg = MapieQuantileRegressor(estimator=estimator, cv="prefit") + mapie_reg.fit([1, 2, 3], [4, 5, 6]) + + # def test_calib_set_no_Xy_but_sample_weight() -> None: # """Test warning message if sample weight provided but no X y in calib.""" # X = np.array([4, 5, 6]) From ecc1d79fcfc96c0c961d8a4ad6b42b930cf5c2ed Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 14:51:22 +0100 Subject: [PATCH 06/13] UPD: add details in value error --- mapie/estimator/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py index df3872b7d..ce44d6bf6 100644 --- a/mapie/estimator/estimator.py +++ b/mapie/estimator/estimator.py @@ -283,7 +283,7 @@ def _aggregate_with_mask( ): raise ValueError( "There should not be aggregation of predictions " - f"if cv is in '{self.no_agg_cv_}' " + f"if cv is in '{self.no_agg_cv_}', if cv >=2 " f"or if method is in '{self.no_agg_methods_}'." ) elif self.agg_function == "median": From d557201d89a1162e21fcadbfc836769d01ccccfc Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 14:04:52 +0000 Subject: [PATCH 07/13] UPD: adapt test utils (short) --- mapie/tests/test_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index 9f8feea20..c7e9401ff 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Optional +from typing import Any, Optional, Tuple import numpy as np import pytest @@ -477,15 +477,15 @@ def test_check_cv_same_split_no_random_state(cv: BaseCrossValidator) -> None: np.testing.assert_allclose(train_indices_1[i], train_indices_2[i]) -def test_check_no_agg_cv() -> None: +@pytest.mark.parametrize( + "cv_result", [ + (1, True), (2, False), + ("split", True), (KFold(5), False), + (ShuffleSplit(1), True), + (ShuffleSplit(2), False) + ] +) +def test_check_no_agg_cv(cv_result: Tuple) -> None: array = ["prefit", "split"] - np.testing.assert_almost_equal(check_no_agg_cv(1, array), True) - np.testing.assert_almost_equal(check_no_agg_cv(2, array), False) - cv = "split" - np.testing.assert_almost_equal(check_no_agg_cv(cv, array), True) - cv = KFold(5) - np.testing.assert_almost_equal(check_no_agg_cv(cv, array), False) - cv = ShuffleSplit(1) - np.testing.assert_almost_equal(check_no_agg_cv(cv, array), True) - cv = ShuffleSplit(2) - np.testing.assert_almost_equal(check_no_agg_cv(cv, array), False) + cv, result = cv_result + np.testing.assert_almost_equal(check_no_agg_cv(cv, array), result) From e085a309466139ce8d99da180289b9a8bab61321 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 15:08:24 +0000 Subject: [PATCH 08/13] UPD: calculate use_split_method once with X dependency --- mapie/estimator/estimator.py | 12 +++++------- mapie/regression/regression.py | 7 ++----- mapie/regression/time_series_regression.py | 5 ++--- mapie/tests/test_regression.py | 1 + mapie/tests/test_utils.py | 2 +- mapie/utils.py | 10 +++++++--- 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py index ce44d6bf6..944a40f00 100644 --- a/mapie/estimator/estimator.py +++ b/mapie/estimator/estimator.py @@ -152,6 +152,7 @@ class EnsembleRegressor(EnsembleEstimator): "single_estimator_", "estimators_", "k_", + "use_split_method", ] def __init__( @@ -278,9 +279,7 @@ def _aggregate_with_mask( ArrayLike of shape (n_samples_test,) Array of aggregated predictions for each testing sample. """ - if self.method in self.no_agg_methods_ or ( - check_no_agg_cv(self.cv, self.no_agg_cv_) - ): + if self.method in self.no_agg_methods_ or self.use_split_method: raise ValueError( "There should not be aggregation of predictions " f"if cv is in '{self.no_agg_cv_}', if cv >=2 " @@ -408,6 +407,7 @@ def fit( estimators_: List[RegressorMixin] = [] full_indexes = np.arange(_num_samples(X)) cv = self.cv + self.use_split_method = check_no_agg_cv(X, self.cv, self.no_agg_cv_) estimator = self.estimator n_samples = _num_samples(y) @@ -437,7 +437,7 @@ def fit( for train_index, _ in cv.split(X) ) # In split-CP, we keep only the model fitted on train dataset - if check_no_agg_cv(cv, self.no_agg_cv_): + if self.use_split_method: single_estimator_ = estimators_[0] self.single_estimator_ = single_estimator_ @@ -490,9 +490,7 @@ def predict( if not return_multi_pred and not ensemble: return y_pred - if self.method in self.no_agg_methods_ or ( - check_no_agg_cv(self.cv, self.no_agg_cv_) - ): + if self.method in self.no_agg_methods_ or self.use_split_method: y_pred_multi_low = y_pred[:, np.newaxis] y_pred_multi_up = y_pred[:, np.newaxis] else: diff --git a/mapie/regression/regression.py b/mapie/regression/regression.py index 339b4537e..9856e37d7 100644 --- a/mapie/regression/regression.py +++ b/mapie/regression/regression.py @@ -18,8 +18,7 @@ from mapie.utils import (check_alpha, check_alpha_and_n_samples, check_conformity_score, check_cv, check_estimator_fit_predict, check_n_features_in, - check_n_jobs, check_no_agg_cv, check_null_weight, - check_verbose) + check_n_jobs, check_null_weight, check_verbose) class MapieRegressor(BaseEstimator, RegressorMixin): @@ -316,9 +315,7 @@ def _check_agg_function( "You need to specify an aggregation function when " f"cv's type is in {self.cv_need_agg_function_}." ) - elif (agg_function is not None) or ( - check_no_agg_cv(self.cv, self.no_agg_cv_) - ): + elif agg_function is not None: return agg_function else: return "mean" diff --git a/mapie/regression/time_series_regression.py b/mapie/regression/time_series_regression.py index 7e2857aea..322954ed4 100644 --- a/mapie/regression/time_series_regression.py +++ b/mapie/regression/time_series_regression.py @@ -12,8 +12,7 @@ from mapie.aggregation_functions import aggregate_all from mapie.conformity_scores import ConformityScore from .regression import MapieRegressor -from mapie.utils import (check_alpha, check_alpha_and_n_samples, - check_no_agg_cv) +from mapie.utils import check_alpha, check_alpha_and_n_samples class MapieTimeSeriesRegressor(MapieRegressor): @@ -369,7 +368,7 @@ def predict( self.higher_quantiles_ = higher_quantiles if self.method in self.no_agg_methods_ or ( - check_no_agg_cv(self.cv, self.no_agg_cv_) + self.estimator_.use_split_method ): y_pred_low = y_pred[:, np.newaxis] + lower_quantiles y_pred_up = y_pred[:, np.newaxis] + higher_quantiles diff --git a/mapie/tests/test_regression.py b/mapie/tests/test_regression.py index ec235eee4..6d26593df 100644 --- a/mapie/tests/test_regression.py +++ b/mapie/tests/test_regression.py @@ -528,6 +528,7 @@ def test_aggregate_with_mask_with_invalid_agg_function() -> None: 0.20, False ) + ens_reg.use_split_method = False with pytest.raises( ValueError, match=r".*The value of self.agg_function is not correct*", diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index c7e9401ff..a59ba1e31 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -488,4 +488,4 @@ def test_check_cv_same_split_no_random_state(cv: BaseCrossValidator) -> None: def test_check_no_agg_cv(cv_result: Tuple) -> None: array = ["prefit", "split"] cv, result = cv_result - np.testing.assert_almost_equal(check_no_agg_cv(cv, array), result) + np.testing.assert_almost_equal(check_no_agg_cv(X_toy, cv, array), result) diff --git a/mapie/utils.py b/mapie/utils.py index b3ca81e29..73b5fd2b1 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -210,6 +210,7 @@ def check_cv( def check_no_agg_cv( + X: ArrayLike, cv: Union[int, str, BaseCrossValidator, BaseShuffleSplit], no_agg_cv_array: list, ) -> bool: @@ -219,6 +220,9 @@ def check_no_agg_cv( Parameters ---------- + X: ArrayLike of shape (n_samples, n_features) + Training data. + cv: Union[int, str, BaseCrossValidator, BaseShuffleSplit] Cross-validator to check. @@ -234,9 +238,9 @@ def check_no_agg_cv( return cv in no_agg_cv_array elif isinstance(cv, int): return cv == 1 - try: - return cv.get_n_splits() == 1 - except Exception: + if hasattr(cv, "get_n_splits"): + return cv.get_n_splits(X) == 1 + else: return False From 4147f2bce110a7cfb48ff713f609dd8586d5bdac Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 15:16:34 +0000 Subject: [PATCH 09/13] ADD: unit test when cv has not get_n_splits method --- mapie/tests/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index a59ba1e31..de01dde2f 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -482,7 +482,8 @@ def test_check_cv_same_split_no_random_state(cv: BaseCrossValidator) -> None: (1, True), (2, False), ("split", True), (KFold(5), False), (ShuffleSplit(1), True), - (ShuffleSplit(2), False) + (ShuffleSplit(2), False), + (object(), False) ] ) def test_check_no_agg_cv(cv_result: Tuple) -> None: From e70be794a6735a1214f4f5403216088dca773a73 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:56:07 +0000 Subject: [PATCH 10/13] UPD: remove useless brackets --- mapie/regression/time_series_regression.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mapie/regression/time_series_regression.py b/mapie/regression/time_series_regression.py index 322954ed4..1e95f8b6e 100644 --- a/mapie/regression/time_series_regression.py +++ b/mapie/regression/time_series_regression.py @@ -367,9 +367,8 @@ def predict( self.lower_quantiles_ = lower_quantiles self.higher_quantiles_ = higher_quantiles - if self.method in self.no_agg_methods_ or ( - self.estimator_.use_split_method - ): + if self.method in self.no_agg_methods_ or \ + self.estimator_.use_split_method: y_pred_low = y_pred[:, np.newaxis] + lower_quantiles y_pred_up = y_pred[:, np.newaxis] + higher_quantiles else: From 21174f2c1d1727e0f1b4a2ddda9c36f7428bb644 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:57:11 +0000 Subject: [PATCH 11/13] FIX: lint --- mapie/regression/time_series_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapie/regression/time_series_regression.py b/mapie/regression/time_series_regression.py index 1e95f8b6e..99f71cf1c 100644 --- a/mapie/regression/time_series_regression.py +++ b/mapie/regression/time_series_regression.py @@ -368,7 +368,7 @@ def predict( self.higher_quantiles_ = higher_quantiles if self.method in self.no_agg_methods_ or \ - self.estimator_.use_split_method: + self.estimator_.use_split_method: y_pred_low = y_pred[:, np.newaxis] + lower_quantiles y_pred_up = y_pred[:, np.newaxis] + higher_quantiles else: From 1fa8b7c3db8a836717fe602e82b27c76e445525a Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 17:14:25 +0000 Subject: [PATCH 12/13] FIX: change the attribute to protected --- mapie/estimator/estimator.py | 10 +++++----- mapie/tests/test_regression.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py index 72828c6b9..27ccca8c4 100644 --- a/mapie/estimator/estimator.py +++ b/mapie/estimator/estimator.py @@ -152,7 +152,7 @@ class EnsembleRegressor(EnsembleEstimator): "single_estimator_", "estimators_", "k_", - "use_split_method", + "use_split_method_", ] def __init__( @@ -279,7 +279,7 @@ def _aggregate_with_mask( ArrayLike of shape (n_samples_test,) Array of aggregated predictions for each testing sample. """ - if self.method in self.no_agg_methods_ or self.use_split_method: + if self.method in self.no_agg_methods_ or self.use_split_method_: raise ValueError( "There should not be aggregation of predictions " f"if cv is in '{self.no_agg_cv_}', if cv >=2 " @@ -407,7 +407,7 @@ def fit( estimators_: List[RegressorMixin] = [] full_indexes = np.arange(_num_samples(X)) cv = self.cv - self.use_split_method = check_no_agg_cv(X, self.cv, self.no_agg_cv_) + self.use_split_method_ = check_no_agg_cv(X, self.cv, self.no_agg_cv_) estimator = self.estimator n_samples = _num_samples(y) @@ -437,7 +437,7 @@ def fit( for train_index, _ in cv.split(X) ) # In split-CP, we keep only the model fitted on train dataset - if self.use_split_method: + if self.use_split_method_: single_estimator_ = estimators_[0] self.single_estimator_ = single_estimator_ @@ -490,7 +490,7 @@ def predict( if not return_multi_pred and not ensemble: return y_pred - if self.method in self.no_agg_methods_ or self.use_split_method: + if self.method in self.no_agg_methods_ or self.use_split_method_: y_pred_multi_low = y_pred[:, np.newaxis] y_pred_multi_up = y_pred[:, np.newaxis] else: diff --git a/mapie/tests/test_regression.py b/mapie/tests/test_regression.py index 5f686587b..06c42e88c 100644 --- a/mapie/tests/test_regression.py +++ b/mapie/tests/test_regression.py @@ -528,7 +528,7 @@ def test_aggregate_with_mask_with_invalid_agg_function() -> None: 0.20, False ) - ens_reg.use_split_method = False + ens_reg.use_split_method_ = False with pytest.raises( ValueError, match=r".*The value of self.agg_function is not correct*", From f46f117b152aad26b39fba3afa057d3b96875240 Mon Sep 17 00:00:00 2001 From: Thibault Cordier <124613154+thibaultcordier@users.noreply.github.com> Date: Wed, 20 Dec 2023 17:41:43 +0000 Subject: [PATCH 13/13] UPD: add raise value error with wrong cv parameter --- mapie/tests/test_utils.py | 17 +++++++++++++++-- mapie/utils.py | 8 ++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py index de01dde2f..ca9346ef6 100644 --- a/mapie/tests/test_utils.py +++ b/mapie/tests/test_utils.py @@ -7,7 +7,8 @@ from numpy.random import RandomState from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression -from sklearn.model_selection import BaseCrossValidator, KFold, ShuffleSplit +from sklearn.model_selection import (BaseCrossValidator, KFold, LeaveOneOut, + ShuffleSplit) from sklearn.utils.validation import check_is_fitted from mapie._typing import ArrayLike, NDArray @@ -483,10 +484,22 @@ def test_check_cv_same_split_no_random_state(cv: BaseCrossValidator) -> None: ("split", True), (KFold(5), False), (ShuffleSplit(1), True), (ShuffleSplit(2), False), - (object(), False) + (LeaveOneOut(), False), ] ) def test_check_no_agg_cv(cv_result: Tuple) -> None: + """Test that if `check_no_agg_cv` function returns the expected result.""" array = ["prefit", "split"] cv, result = cv_result np.testing.assert_almost_equal(check_no_agg_cv(X_toy, cv, array), result) + + +@pytest.mark.parametrize("cv", [object()]) +def test_check_no_agg_cv_value_error(cv: Any) -> None: + """Test that if `check_no_agg_cv` function raises value error.""" + array = ["prefit", "split"] + with pytest.raises( + ValueError, + match=r"Allowed values must have the `get_n_splits` method" + ): + check_no_agg_cv(X_toy, cv, array) diff --git a/mapie/utils.py b/mapie/utils.py index 4145e3868..a70583f87 100644 --- a/mapie/utils.py +++ b/mapie/utils.py @@ -238,10 +238,14 @@ def check_no_agg_cv( return cv in no_agg_cv_array elif isinstance(cv, int): return cv == 1 - if hasattr(cv, "get_n_splits"): + elif hasattr(cv, "get_n_splits"): return cv.get_n_splits(X) == 1 else: - return False + raise ValueError( + "Invalid cv argument. " + "Allowed values must have the `get_n_splits` method " + "with zero or one parameter (X)." + ) def check_alpha(