From e62eb237cba2d08cc239c602a0df8ac7dc251181 Mon Sep 17 00:00:00 2001 From: chendingyan Date: Fri, 5 Mar 2021 16:48:21 +0800 Subject: [PATCH 1/6] [FIX] Fix regressor y value check --- deepforest/cascade.py | 191 +++++++++++++++++++++++------------------- 1 file changed, 107 insertions(+), 84 deletions(-) diff --git a/deepforest/cascade.py b/deepforest/cascade.py index 89aa29b..e4e98a7 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -1,6 +1,5 @@ """Implementation of Deep Forest.""" - __all__ = ["CascadeForestClassifier", "CascadeForestRegressor"] import time @@ -27,15 +26,15 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict: def _build_classifier_predictor( - predictor_name, - criterion, - n_estimators, - n_outputs, - max_depth=None, - min_samples_leaf=1, - n_jobs=None, - random_state=None, - predictor_kwargs={}, + predictor_name, + criterion, + n_estimators, + n_outputs, + max_depth=None, + min_samples_leaf=1, + n_jobs=None, + random_state=None, + predictor_kwargs={}, ): """Build the predictor concatenated to the deep forest.""" predictor_name = predictor_name.lower() @@ -111,15 +110,15 @@ def _build_classifier_predictor( def _build_regressor_predictor( - predictor_name, - criterion, - n_estimators, - n_outputs, - max_depth=None, - min_samples_leaf=1, - n_jobs=None, - random_state=None, - predictor_kwargs={}, + predictor_name, + criterion, + n_estimators, + n_outputs, + max_depth=None, + min_samples_leaf=1, + n_jobs=None, + random_state=None, + predictor_kwargs={}, ): """Build the predictor concatenated to the deep forest.""" predictor_name = predictor_name.lower() @@ -278,7 +277,6 @@ def _build_regressor_predictor( - If ``> 1``, full logging information will be displayed. """ - __classifier_fit_doc = """ .. note:: @@ -456,26 +454,26 @@ def adddoc(cls): class BaseCascadeForest(BaseEstimator, metaclass=ABCMeta): def __init__( - self, - n_bins=255, - bin_subsample=200000, - bin_type="percentile", - max_layers=20, - criterion="", - n_estimators=2, - n_trees=100, - max_depth=None, - min_samples_leaf=1, - use_predictor=False, - predictor="forest", - predictor_kwargs={}, - backend="custom", - n_tolerant_rounds=2, - delta=1e-5, - partial_mode=False, - n_jobs=None, - random_state=None, - verbose=1, + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, ): self.n_bins = n_bins self.bin_subsample = bin_subsample @@ -674,7 +672,7 @@ def _handle_early_stopping(self): Remove cascade layers temporarily added, along with dumped objects on the local buffer if `partial_mode` is True.""" for layer_idx in range( - self.n_layers_ - 1, self.n_layers_ - self.n_tolerant_rounds, -1 + self.n_layers_ - 1, self.n_layers_ - self.n_tolerant_rounds, -1 ): self.layers_.pop("layer_{}".format(layer_idx)) self.binners_.pop("binner_{}".format(layer_idx)) @@ -1169,26 +1167,26 @@ def clean(self): ) class CascadeForestClassifier(BaseCascadeForest, ClassifierMixin): def __init__( - self, - n_bins=255, - bin_subsample=200000, - bin_type="percentile", - max_layers=20, - criterion="gini", - n_estimators=2, - n_trees=100, - max_depth=None, - min_samples_leaf=1, - use_predictor=False, - predictor="forest", - predictor_kwargs={}, - backend="custom", - n_tolerant_rounds=2, - delta=1e-5, - partial_mode=False, - n_jobs=None, - random_state=None, - verbose=1, + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="gini", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, ): super().__init__( n_bins=n_bins, @@ -1364,26 +1362,26 @@ def predict(self, X): ) class CascadeForestRegressor(BaseCascadeForest, RegressorMixin): def __init__( - self, - n_bins=255, - bin_subsample=200000, - bin_type="percentile", - max_layers=20, - criterion="mse", - n_estimators=2, - n_trees=100, - max_depth=None, - min_samples_leaf=1, - use_predictor=False, - predictor="forest", - predictor_kwargs={}, - backend="custom", - n_tolerant_rounds=2, - delta=1e-5, - partial_mode=False, - n_jobs=None, - random_state=None, - verbose=1, + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="mse", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, ): super().__init__( n_bins=n_bins, @@ -1413,18 +1411,43 @@ def __init__( def _check_target_values(self, y): """ Check the input target values for regressor. + e.g. + np.array(['a','b','a','c']) will be regarded as "multiclass" but not pass self._check_array_numeric test + np.array(['a','1',1,2]) will be regarded as "multiclass" but not pass self._check_array_numeric test + np.array([1,2,1,3]) will be regarded as "multiclass" and pass self._check_array_numeric test + np.array([1.0,2.0,3.3]) will be regarded as "continuous" and pass self._check_array_numeric test """ self.type_of_target_ = type_of_target(y) if self.type_of_target_ not in ( - "continuous", - "continuous-multioutput", - ): + "continuous", + "continuous-multioutput", + "multiclass", + "multiclass-multioutput" + ) and self._check_array_numeric(y): msg = ( "CascadeForestRegressor is used for univariate or multi-variate regression," " but the target values seem not to be one of them." ) raise ValueError(msg) + def _check_array_numeric(self, y): + """ + check the input numpy array y is all numeric + + Parameters + ---------- + y: numpy array + + Returns + ------- + bool, True if array contains all numbers else False + """ + + if y.dtype.kind in np.typecodes['AllInteger'] + np.typecodes["AllFloat"]: + return True + else: + return False + def _repr_performance(self, pivot): msg = "Val MSE = {:.5f}" return msg.format(pivot) From 342853858c68c3a049bd5e42990a0b8027c26b30 Mon Sep 17 00:00:00 2001 From: chendingyan Date: Fri, 5 Mar 2021 16:57:23 +0800 Subject: [PATCH 2/6] [FIX] Revert Code Quality --- deepforest/cascade.py | 161 +++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 79 deletions(-) diff --git a/deepforest/cascade.py b/deepforest/cascade.py index e4e98a7..f65ff47 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -1,5 +1,6 @@ """Implementation of Deep Forest.""" + __all__ = ["CascadeForestClassifier", "CascadeForestRegressor"] import time @@ -26,15 +27,15 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict: def _build_classifier_predictor( - predictor_name, - criterion, - n_estimators, - n_outputs, - max_depth=None, - min_samples_leaf=1, - n_jobs=None, - random_state=None, - predictor_kwargs={}, + predictor_name, + criterion, + n_estimators, + n_outputs, + max_depth=None, + min_samples_leaf=1, + n_jobs=None, + random_state=None, + predictor_kwargs={}, ): """Build the predictor concatenated to the deep forest.""" predictor_name = predictor_name.lower() @@ -110,15 +111,15 @@ def _build_classifier_predictor( def _build_regressor_predictor( - predictor_name, - criterion, - n_estimators, - n_outputs, - max_depth=None, - min_samples_leaf=1, - n_jobs=None, - random_state=None, - predictor_kwargs={}, + predictor_name, + criterion, + n_estimators, + n_outputs, + max_depth=None, + min_samples_leaf=1, + n_jobs=None, + random_state=None, + predictor_kwargs={}, ): """Build the predictor concatenated to the deep forest.""" predictor_name = predictor_name.lower() @@ -277,6 +278,7 @@ def _build_regressor_predictor( - If ``> 1``, full logging information will be displayed. """ + __classifier_fit_doc = """ .. note:: @@ -454,26 +456,26 @@ def adddoc(cls): class BaseCascadeForest(BaseEstimator, metaclass=ABCMeta): def __init__( - self, - n_bins=255, - bin_subsample=200000, - bin_type="percentile", - max_layers=20, - criterion="", - n_estimators=2, - n_trees=100, - max_depth=None, - min_samples_leaf=1, - use_predictor=False, - predictor="forest", - predictor_kwargs={}, - backend="custom", - n_tolerant_rounds=2, - delta=1e-5, - partial_mode=False, - n_jobs=None, - random_state=None, - verbose=1, + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, ): self.n_bins = n_bins self.bin_subsample = bin_subsample @@ -672,7 +674,7 @@ def _handle_early_stopping(self): Remove cascade layers temporarily added, along with dumped objects on the local buffer if `partial_mode` is True.""" for layer_idx in range( - self.n_layers_ - 1, self.n_layers_ - self.n_tolerant_rounds, -1 + self.n_layers_ - 1, self.n_layers_ - self.n_tolerant_rounds, -1 ): self.layers_.pop("layer_{}".format(layer_idx)) self.binners_.pop("binner_{}".format(layer_idx)) @@ -1167,26 +1169,26 @@ def clean(self): ) class CascadeForestClassifier(BaseCascadeForest, ClassifierMixin): def __init__( - self, - n_bins=255, - bin_subsample=200000, - bin_type="percentile", - max_layers=20, - criterion="gini", - n_estimators=2, - n_trees=100, - max_depth=None, - min_samples_leaf=1, - use_predictor=False, - predictor="forest", - predictor_kwargs={}, - backend="custom", - n_tolerant_rounds=2, - delta=1e-5, - partial_mode=False, - n_jobs=None, - random_state=None, - verbose=1, + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="gini", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, ): super().__init__( n_bins=n_bins, @@ -1362,26 +1364,26 @@ def predict(self, X): ) class CascadeForestRegressor(BaseCascadeForest, RegressorMixin): def __init__( - self, - n_bins=255, - bin_subsample=200000, - bin_type="percentile", - max_layers=20, - criterion="mse", - n_estimators=2, - n_trees=100, - max_depth=None, - min_samples_leaf=1, - use_predictor=False, - predictor="forest", - predictor_kwargs={}, - backend="custom", - n_tolerant_rounds=2, - delta=1e-5, - partial_mode=False, - n_jobs=None, - random_state=None, - verbose=1, + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="mse", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, ): super().__init__( n_bins=n_bins, @@ -1448,6 +1450,7 @@ def _check_array_numeric(self, y): else: return False + def _repr_performance(self, pivot): msg = "Val MSE = {:.5f}" return msg.format(pivot) From a775dbdff2a1f55d656991c54a433392b65f3735 Mon Sep 17 00:00:00 2001 From: chendingyan Date: Fri, 5 Mar 2021 17:01:52 +0800 Subject: [PATCH 3/6] [FIX] Fix Logic --- deepforest/cascade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepforest/cascade.py b/deepforest/cascade.py index f65ff47..b723443 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -1425,7 +1425,7 @@ def _check_target_values(self, y): "continuous-multioutput", "multiclass", "multiclass-multioutput" - ) and self._check_array_numeric(y): + ) or not self._check_array_numeric(y): msg = ( "CascadeForestRegressor is used for univariate or multi-variate regression," " but the target values seem not to be one of them." From 4708e1731720646cc46cdefe5b23b962698382c4 Mon Sep 17 00:00:00 2001 From: chendingyan Date: Fri, 5 Mar 2021 17:09:10 +0800 Subject: [PATCH 4/6] [FIX] Reformat to pass code quality check --- deepforest/cascade.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deepforest/cascade.py b/deepforest/cascade.py index b723443..05ba422 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -1421,10 +1421,10 @@ def _check_target_values(self, y): """ self.type_of_target_ = type_of_target(y) if self.type_of_target_ not in ( - "continuous", - "continuous-multioutput", - "multiclass", - "multiclass-multioutput" + "continuous", + "continuous-multioutput", + "multiclass", + "multiclass-multioutput" ) or not self._check_array_numeric(y): msg = ( "CascadeForestRegressor is used for univariate or multi-variate regression," From a96cdf236c2c0eecab28a611e2028798b09a5a99 Mon Sep 17 00:00:00 2001 From: xuyxu Date: Fri, 5 Mar 2021 18:45:51 +0800 Subject: [PATCH 5/6] refactor some code to pass ci --- deepforest/cascade.py | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/deepforest/cascade.py b/deepforest/cascade.py index 05ba422..92200bd 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -1411,46 +1411,37 @@ def __init__( self.type_of_target_ = None def _check_target_values(self, y): - """ - Check the input target values for regressor. - e.g. - np.array(['a','b','a','c']) will be regarded as "multiclass" but not pass self._check_array_numeric test - np.array(['a','1',1,2]) will be regarded as "multiclass" but not pass self._check_array_numeric test - np.array([1,2,1,3]) will be regarded as "multiclass" and pass self._check_array_numeric test - np.array([1.0,2.0,3.3]) will be regarded as "continuous" and pass self._check_array_numeric test - """ + """Check the input target values for regressor.""" self.type_of_target_ = type_of_target(y) + + if not self._check_array_numeric(y): + msg = ( + "CascadeForestRegressor only accepts numeric values as" + " valid target values." + ) + raise ValueError(msg) + if self.type_of_target_ not in ( "continuous", "continuous-multioutput", "multiclass", - "multiclass-multioutput" - ) or not self._check_array_numeric(y): + "multiclass-multioutput", + ): msg = ( - "CascadeForestRegressor is used for univariate or multi-variate regression," - " but the target values seem not to be one of them." + "CascadeForestRegressor is used for univariate or" + " multi-variate regression, but the target values seem not" + " to be one of them." ) raise ValueError(msg) def _check_array_numeric(self, y): - """ - check the input numpy array y is all numeric - - Parameters - ---------- - y: numpy array - - Returns - ------- - bool, True if array contains all numbers else False - """ - - if y.dtype.kind in np.typecodes['AllInteger'] + np.typecodes["AllFloat"]: + """Check the input numpy array y is all numeric.""" + numeric_types = np.typecodes['AllInteger'] + np.typecodes["AllFloat"] + if y.dtype.kind in numeric_types: return True else: return False - def _repr_performance(self, pivot): msg = "Val MSE = {:.5f}" return msg.format(pivot) From 9b8ded345a3b93465f091323a74a63c5cb0ad60f Mon Sep 17 00:00:00 2001 From: xuyxu Date: Mon, 8 Mar 2021 11:06:36 +0800 Subject: [PATCH 6/6] Update CHANGELOG.rst --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 17dd86e..4540840 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -30,6 +30,7 @@ Version 0.1.* .. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}` .. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}` +- |Enhancement| improve target checks for :obj:`CascadeForestRegressor` (`#53 `__) @chendingyan - |Fix| fix inconsistency on predictor name (`#52 `__) @xuyxu - |Feature| add official support for ManyLinux-aarch64 (`#47 `__) @xuyxu - |Fix| fix accepted types of target for :obj:`CascadeForestRegressor` (`#44 `__) @xuyxu