From 22587c229c15d9854a94955b4f34ce50e4b93c4f Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Tue, 17 Dec 2024 19:24:52 +0100 Subject: [PATCH] REFACTO: refactor MapieQuantileRegressor internals to prepare for v1: reorganized and renamed functions, fixed 1 test --- mapie/regression/quantile_regression.py | 164 ++++++++++-------------- mapie/tests/test_quantile_regression.py | 10 +- 2 files changed, 75 insertions(+), 99 deletions(-) diff --git a/mapie/regression/quantile_regression.py b/mapie/regression/quantile_regression.py index d86b0de67..3d2e30c3e 100644 --- a/mapie/regression/quantile_regression.py +++ b/mapie/regression/quantile_regression.py @@ -346,13 +346,11 @@ def _check_cv( "Invalid cv method, only valid method is ``split``." ) - def _check_calib_set( + def _train_calib_split( self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, - X_calib: Optional[ArrayLike] = None, - y_calib: Optional[ArrayLike] = None, calib_size: Optional[float] = 0.3, random_state: Optional[Union[int, np.random.RandomState, None]] = None, shuffle: Optional[bool] = True, @@ -360,61 +358,33 @@ def _check_calib_set( ) -> Tuple[ ArrayLike, ArrayLike, ArrayLike, ArrayLike, Optional[ArrayLike] ]: - """ - Check if a calibration set has already been defined, if not, then - we define one using the ``train_test_split`` method. - - Parameters - ---------- - Same definition of parameters as for the ``fit`` method. 
- - Returns - ------- - Tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike, ArrayLike] - - [0]: ArrayLike of shape (n_samples_*(1-calib_size), n_features) - X_train - - [1]: ArrayLike of shape (n_samples_*(1-calib_size),) - y_train - - [2]: ArrayLike of shape (n_samples_*calib_size, n_features) - X_calib - - [3]: ArrayLike of shape (n_samples_*calib_size,) - y_calib - - [4]: ArrayLike of shape (n_samples_,) - sample_weight_train - """ - if X_calib is None or y_calib is None: - if sample_weight is None: - X_train, X_calib, y_train, y_calib = train_test_split( - X, - y, - test_size=calib_size, - random_state=random_state, - shuffle=shuffle, - stratify=stratify - ) - sample_weight_train = sample_weight - else: - ( - X_train, - X_calib, - y_train, - y_calib, - sample_weight_train, - _, - ) = train_test_split( - X, - y, - sample_weight, - test_size=calib_size, - random_state=random_state, - shuffle=shuffle, - stratify=stratify - ) + if sample_weight is None: + X_train, X_calib, y_train, y_calib = train_test_split( + X, + y, + test_size=calib_size, + random_state=random_state, + shuffle=shuffle, + stratify=stratify + ) + sample_weight_train = sample_weight else: - X_train, y_train, sample_weight_train = X, y, sample_weight - X_train, X_calib = cast(ArrayLike, X_train), cast(ArrayLike, X_calib) - y_train, y_calib = cast(ArrayLike, y_train), cast(ArrayLike, y_calib) - sample_weight_train = cast(ArrayLike, sample_weight_train) + ( + X_train, + X_calib, + y_train, + y_calib, + sample_weight_train, + _, + ) = train_test_split( + X, + y, + sample_weight, + test_size=calib_size, + random_state=random_state, + shuffle=shuffle, + stratify=stratify + ) return X_train, y_train, X_calib, y_calib, sample_weight_train def _check_prefit_params( @@ -547,13 +517,12 @@ def fit( MapieQuantileRegressor The model itself. 
""" - - self.initialize_fit() + self._initialize_fit_conformalize() if self.cv == "prefit": - X_calib, y_calib = self.prefit_estimators(X, y) + X_calib, y_calib = X, y else: - X_calib, y_calib = self.fit_estimators( + X_calib, y_calib = self._fit_estimators( X=X, y=y, sample_weight=sample_weight, @@ -571,26 +540,18 @@ def fit( return self - def initialize_fit(self) -> None: + def _initialize_fit_conformalize(self) -> None: self.cv = self._check_cv(cast(str, self.cv)) self.alpha_np = self._check_alpha(self.alpha) self.estimators_: List[RegressorMixin] = [] - def prefit_estimators( - self, - X: ArrayLike, - y: ArrayLike - ) -> Tuple[ArrayLike, ArrayLike]: - + def _initialize_and_check_prefit_estimators(self) -> None: estimator = cast(List, self.estimator) self._check_prefit_params(estimator) self.estimators_ = list(estimator) self.single_estimator_ = self.estimators_[2] - X_calib, y_calib = indexable(X, y) - return X_calib, y_calib - - def fit_estimators( + def _fit_estimators( self, X: ArrayLike, y: ArrayLike, @@ -604,30 +565,39 @@ def fit_estimators( stratify: Optional[ArrayLike] = None, **fit_params, ) -> Tuple[ArrayLike, ArrayLike]: + """ + This method: + - Creates train and calib sets + - Checks and casts params, including the train set + - Fits the 3 estimators + - Returns the calib set + """ self._check_parameters() checked_estimator = self._check_estimator(self.estimator) random_state = check_random_state(random_state) X, y = indexable(X, y) - results = self._check_calib_set( - X, - y, - sample_weight, - X_calib, - y_calib, - calib_size, - random_state, - shuffle, - stratify, - ) + if X_calib is None or y_calib is None: + ( + X_train, y_train, X_calib, y_calib, sample_weight_train + ) = self._train_calib_split( + X, + y, + sample_weight, + calib_size, + random_state, + shuffle, + stratify, + ) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight - X_train, y_train, X_calib, y_calib, sample_weight_train = results + X_train, y_train = 
cast(ArrayLike, X_train), cast(ArrayLike, y_train) + sample_weight_train = cast(ArrayLike, sample_weight_train) X_train, y_train = indexable(X_train, y_train) - X_calib, y_calib = indexable(X_calib, y_calib) - y_train, y_calib = _check_y(y_train), _check_y(y_calib) - self.n_calib_samples = _num_samples(y_calib) - check_alpha_and_n_samples(self.alpha, self.n_calib_samples) + y_train = _check_y(y_train) + sample_weight_train, X_train, y_train = check_null_weight( sample_weight_train, X_train, @@ -660,9 +630,6 @@ def fit_estimators( ) self.single_estimator_ = self.estimators_[2] - X_calib = cast(ArrayLike, X_calib) - y_calib = cast(ArrayLike, y_calib) - return X_calib, y_calib def conformalize( @@ -674,8 +641,15 @@ def conformalize( groups: Optional[ArrayLike] = None, **kwargs: Any, ) -> MapieRegressor: + if self.cv == "prefit": + self._initialize_and_check_prefit_estimators() + + X_calib, y_calib = cast(ArrayLike, X), cast(ArrayLike, y) + X_calib, y_calib = indexable(X_calib, y_calib) + y_calib = _check_y(y_calib) - self.n_calib_samples = _num_samples(y) + self.n_calib_samples = _num_samples(y_calib) + check_alpha_and_n_samples(self.alpha, self.n_calib_samples) y_calib_preds = np.full( shape=(3, self.n_calib_samples), @@ -683,15 +657,15 @@ def conformalize( ) for i, est in enumerate(self.estimators_): - y_calib_preds[i] = est.predict(X, **kwargs).ravel() + y_calib_preds[i] = est.predict(X_calib, **kwargs).ravel() self.conformity_scores_ = np.full( shape=(3, self.n_calib_samples), fill_value=np.nan ) - self.conformity_scores_[0] = y_calib_preds[0] - y - self.conformity_scores_[1] = y - y_calib_preds[1] + self.conformity_scores_[0] = y_calib_preds[0] - y_calib + self.conformity_scores_[1] = y_calib - y_calib_preds[1] self.conformity_scores_[2] = np.max( [ self.conformity_scores_[0], diff --git a/mapie/tests/test_quantile_regression.py b/mapie/tests/test_quantile_regression.py index 0ca88651e..871d62ccd 100644 --- a/mapie/tests/test_quantile_regression.py +++ 
b/mapie/tests/test_quantile_regression.py @@ -470,11 +470,13 @@ def test_for_small_dataset() -> None: estimator=qt, alpha=0.1 ) + X_calib_toy_small = X_calib_toy[:2] + y_calib_toy_small = y_calib_toy[:2] mapie_reg.fit( - np.array([1, 2, 3]), - np.array([2, 2, 3]), - X_calib=np.array([3, 5]), - y_calib=np.array([2, 3]) + X_train_toy, + y_train_toy, + X_calib=X_calib_toy_small, + y_calib=y_calib_toy_small )