FIX: merge with master

scikit-learn-contrib · Feb 26, 2024 · 05003e6 · 05003e6
1 parent cdf6cdd
commit 05003e6
Show file tree

Hide file tree

Showing 13 changed files with 305 additions and 94 deletions.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -37,4 +37,5 @@ Contributors
 * Rafael Saraiva <[email protected]>
 * Mehdi Elion <[email protected]>
 * Sami Kaddani <[email protected]>
+* Pierre de Fréminville <pidefrem>
 To be continued ...
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -4,7 +4,17 @@ History
 
 ##### (##########)
 ------------------
+* Allow the use of `y` and `groups` arguments in cross validator methods `get_n_splits`
+  and `split` to enable more cv-split variants for :class:`~regression.regression.MapieRegressor`
+  and :class:`~classification.MapieClassifier`
+  (e.g. :class:`sklearn.model_selection.GroupKFold`, stratified continuous split).
+  This change adds the `groups` argument to the following methods:
+  :meth:`~estimator.interface.EnsembleEstimator.fit()`,
+  :meth:`~estimator.estimator.EnsembleRegressor.predict_calib()`, :meth:`~estimator.estimator.EnsembleRegressor.fit()`,
+  :meth:`~regression.regression.MapieRegressor.fit()`,
+  :meth:`~classification.MapieClassifier.fit()`.
 * Add possibility of passing fit parameters used by estimators.
+* Fix memory issue in CQR when testing for upper and lower bounds.
 
 0.8.0 (2024-01-03)
 ------------------

diff --git a/Makefile b/Makefile
@@ -1,6 +1,6 @@
 .PHONY: tests doc build
 
-lint:	
+lint:
 	flake8 . --exclude=doc
 
 type-check:

diff --git a/mapie/classification.py b/mapie/classification.py
@@ -1053,6 +1053,7 @@ def fit(
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
         size_raps: Optional[float] = .2,
+        groups: Optional[ArrayLike] = None,
         **fit_params,
     ) -> MapieClassifier:
         """
@@ -1081,10 +1082,15 @@ def fit(
 
             By default ``.2``.
 
+        groups: Optional[ArrayLike] of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+
+            By default ``None``.
+
         **fit_params : dict
             Additional fit parameters.
 
-
         Returns
         -------
         MapieClassifier
@@ -1099,6 +1105,7 @@ def fit(
         y = _check_y(y)
 
         sample_weight = cast(Optional[NDArray], sample_weight)
+        groups = cast(Optional[NDArray], groups)
         sample_weight, X, y = check_null_weight(sample_weight, X, y)
 
         y = cast(NDArray, y)
@@ -1147,6 +1154,9 @@ def fit(
             if sample_weight is not None:
                 sample_weight = sample_weight[train_raps_index]
                 sample_weight = cast(NDArray, sample_weight)
+            if groups is not None:
+                groups = groups[train_raps_index]
+                groups = cast(NDArray, groups)
 
         # Work
         if cv == "prefit":
@@ -1174,7 +1184,9 @@ def fit(
                     sample_weight,
                     **fit_params,
                 )
-                for k, (train_index, val_index) in enumerate(cv.split(X))
+                for k, (train_index, val_index) in enumerate(
+                    cv.split(X, y_enc, groups)
+                )
             )
             (
                 self.estimators_,

diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py
@@ -330,7 +330,12 @@ def _pred_multi(self, X: ArrayLike) -> NDArray:
         y_pred_multi = self._aggregate_with_mask(y_pred_multi, self.k_)
         return y_pred_multi
 
-    def predict_calib(self, X: ArrayLike) -> NDArray:
+    def predict_calib(
+        self,
+        X: ArrayLike,
+        y: Optional[ArrayLike] = None,
+        groups: Optional[ArrayLike] = None
+    ) -> NDArray:
         """
         Perform predictions on X : the calibration set.
 
@@ -339,6 +344,17 @@ def predict_calib(self, X: ArrayLike) -> NDArray:
         X: ArrayLike of shape (n_samples_test, n_features)
             Input data
 
+        y: Optional[ArrayLike] of shape (n_samples_test,)
+            Input labels.
+
+            By default ``None``.
+
+        groups: Optional[ArrayLike] of shape (n_samples_test,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+
+            By default ``None``.
+
         Returns
         -------
         NDArray of shape (n_samples_test, 1)
@@ -357,15 +373,17 @@ def predict_calib(self, X: ArrayLike) -> NDArray:
                     delayed(self._predict_oof_estimator)(
                         estimator, X, calib_index,
                     )
-                    for (_, calib_index), estimator in zip(cv.split(X),
-                                                           self.estimators_)
+                    for (_, calib_index), estimator in zip(
+                        cv.split(X, y, groups),
+                        self.estimators_
+                    )
                 )
                 predictions, indices = map(
                     list, zip(*outputs)
                 )
                 n_samples = _num_samples(X)
                 pred_matrix = np.full(
-                    shape=(n_samples, cv.get_n_splits(X)),
+                    shape=(n_samples, cv.get_n_splits(X, y, groups)),
                     fill_value=np.nan,
                     dtype=float,
                 )
@@ -385,6 +403,7 @@ def fit(
         X: ArrayLike,
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
+        groups: Optional[ArrayLike] = None,
         **fit_params,
     ) -> EnsembleRegressor:
         """
@@ -404,6 +423,13 @@ def fit(
 
         sample_weight: Optional[ArrayLike] of shape (n_samples,)
             Sample weights. If None, then samples are equally weighted.
+
+            By default ``None``.
+
+        groups: Optional[ArrayLike] of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+
             By default ``None``.
 
         **fit_params : dict
@@ -440,7 +466,7 @@ def fit(
             )
             cv = cast(BaseCrossValidator, cv)
             self.k_ = np.full(
-                shape=(n_samples, cv.get_n_splits(X, y)),
+                shape=(n_samples, cv.get_n_splits(X, y, groups)),
                 fill_value=np.nan,
                 dtype=float,
             )
@@ -456,7 +482,7 @@ def fit(
                         sample_weight,
                         **fit_params
                     )
-                    for train_index, _ in cv.split(X)
+                    for train_index, _ in cv.split(X, y, groups)
                 )
                 # In split-CP, we keep only the model fitted on train dataset
                 if self.use_split_method_:

diff --git a/mapie/estimator/interface.py b/mapie/estimator/interface.py
@@ -21,6 +21,7 @@ def fit(
         X: ArrayLike,
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
+        groups: Optional[ArrayLike] = None,
         **fit_params
     ) -> EnsembleEstimator:
         """
@@ -42,6 +43,11 @@ def fit(
             Sample weights. If None, then samples are equally weighted.
             By default ``None``.
 
+        groups: Optional[ArrayLike] of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+            By default ``None``.
+
         **fit_params : dict
             Additional fit parameters.
 

diff --git a/mapie/regression/quantile_regression.py b/mapie/regression/quantile_regression.py
@@ -463,6 +463,7 @@ def fit(
         X: ArrayLike,
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
+        groups: Optional[ArrayLike] = None,
         X_calib: Optional[ArrayLike] = None,
         y_calib: Optional[ArrayLike] = None,
         calib_size: Optional[float] = 0.3,
@@ -499,6 +500,9 @@ def fit(
 
             By default ``None``.
 
+        groups: Optional[ArrayLike] of shape (n_samples,)
+            Always ignored, exists for compatibility.
+
         X_calib: Optional[ArrayLike] of shape (n_calib_samples, n_features)
             Calibration data.
 
@@ -696,6 +700,7 @@ def predict(
         )
         for i, est in enumerate(self.estimators_):
             y_preds[i] = est.predict(X)
+        check_lower_upper_bounds(y_preds[0], y_preds[1], y_preds[2])
         if symmetry:
             quantile = np.full(
                 2,
@@ -716,5 +721,5 @@ def predict(
             )
         y_pred_low = y_preds[0][:, np.newaxis] - quantile[0]
         y_pred_up = y_preds[1][:, np.newaxis] + quantile[1]
-        check_lower_upper_bounds(y_preds, y_pred_low, y_pred_up)
+        check_lower_upper_bounds(y_pred_low, y_pred_up, y_preds[2])
         return y_preds[2], np.stack([y_pred_low, y_pred_up], axis=1)
diff --git a/mapie/regression/regression.py b/mapie/regression/regression.py
@@ -392,6 +392,7 @@ def _check_fit_parameters(
         X: ArrayLike,
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
+        groups: Optional[ArrayLike] = None
     ):
         """
         Perform several checks on class parameters.
@@ -407,6 +408,11 @@ def _check_fit_parameters(
         sample_weight: Optional[NDArray] of shape (n_samples,)
             Non-null sample weights.
 
+        groups: Optional[ArrayLike] of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+            By default ``None``.
+
         Raises
         ------
         ValueError
@@ -449,14 +455,21 @@ def _check_fit_parameters(
         X = cast(NDArray, X)
         y = cast(NDArray, y)
         sample_weight = cast(Optional[NDArray], sample_weight)
+        groups = cast(Optional[NDArray], groups)
 
-        return estimator, cs_estimator, agg_function, cv, X, y, sample_weight
+        return (
+            estimator, cs_estimator,
+            agg_function, cv,
+            X, y,
+            sample_weight, groups
+        )
 
     def fit(
         self,
         X: ArrayLike,
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
+        groups: Optional[ArrayLike] = None,
         **fit_params,
     ) -> MapieRegressor:
         """
@@ -485,6 +498,11 @@ def fit(
 
             By default ``None``.
 
+        groups: Optional[ArrayLike] of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+            By default ``None``.
+
         **fit_params : dict
             Additional fit parameters.
 
@@ -500,7 +518,8 @@ def fit(
          cv,
          X,
          y,
-         sample_weight) = self._check_fit_parameters(X, y, sample_weight)
+         sample_weight,
+         groups) = self._check_fit_parameters(X, y, sample_weight, groups)
 
         self.estimator_ = EnsembleRegressor(
             estimator,
@@ -514,11 +533,11 @@ def fit(
         )
         # Fit the prediction function
         self.estimator_ = self.estimator_.fit(
-            X, y, sample_weight, **fit_params
+            X, y, sample_weight=sample_weight, groups=groups, **fit_params
         )
 
         # Predict on calibration data
-        y_pred = self.estimator_.predict_calib(X)
+        y_pred = self.estimator_.predict_calib(X, y=y, groups=groups)
 
         # Compute the conformity scores (manage jk-ab case)
         self.conformity_scores_ = \

diff --git a/mapie/subsample.py b/mapie/subsample.py
@@ -56,7 +56,7 @@ def __init__(
         self.random_state = random_state
 
     def split(
-        self, X: NDArray
+        self, X: NDArray, *args: Any, **kargs: Any
     ) -> Generator[Tuple[NDArray, NDArray], None, None]:
         """
         Generate indices to split data into training and test sets.
@@ -154,7 +154,7 @@ def __init__(
         self.random_state = random_state
 
     def split(
-        self, X: NDArray
+        self, X: NDArray, *args: Any, **kargs: Any
     ) -> Generator[Tuple[NDArray, NDArray], None, None]:
         """
         Generate indices to split data into training and test sets.