From 2f63bd3a6d6bb20378b225474387eb94cb06781e Mon Sep 17 00:00:00 2001 From: Alfonso Tobar <48638337+datacubeR@users.noreply.github.com> Date: Thu, 23 Mar 2023 04:21:49 -0300 Subject: [PATCH 1/6] Add code examples in timeseries module's docstrings (#647) * Adding code examples for Timeseries Module * Fixing details in the examples --- .../forecasting/expanding_window_features.py | 22 +++++++++++++++++++ .../timeseries/forecasting/lag_features.py | 22 +++++++++++++++++++ .../timeseries/forecasting/window_features.py | 22 +++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 3cabf0142..c8306c085 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -117,6 +117,28 @@ class ExpandingWindowFeatures(BaseForecastTransformer): pandas.expanding pandas.aggregate pandas.shift + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures + >>> X = pd.DataFrame(dict(date = ["2022-09-18", + >>> "2022-09-19", + >>> "2022-09-20", + >>> "2022-09-21", + >>> "2022-09-22"], + >>> x1 = [1,2,3,4,5], + >>> x2 = [6,7,8,9,10] + >>> )) + >>> ewf = ExpandingWindowFeatures() + >>> ewf.fit_transform(X) + date x1 x2 x1_expanding_mean x2_expanding_mean + 0 2022-09-18 1 6 NaN NaN + 1 2022-09-19 2 7 1.0 6.0 + 2 2022-09-20 3 8 1.5 6.5 + 3 2022-09-21 4 9 2.0 7.0 + 4 2022-09-22 5 10 2.5 7.5 """ def __init__( diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 6abccb949..a7a1cef26 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -95,6 +95,28 @@ class LagFeatures(BaseForecastTransformer): See Also -------- pandas.shift + + 
Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.timeseries.forecasting import LagFeatures + >>> X = pd.DataFrame(dict(date = ["2022-09-18", + >>> "2022-09-19", + >>> "2022-09-20", + >>> "2022-09-21", + >>> "2022-09-22"], + >>> x1 = [1,2,3,4,5], + >>> x2 = [6,7,8,9,10] + >>> )) + >>> lf = LagFeatures(periods=[1,2]) + >>> lf.fit_transform(X) + date x1 x2 x1_lag_1 x2_lag_1 x1_lag_2 x2_lag_2 + 0 2022-09-18 1 6 NaN NaN NaN NaN + 1 2022-09-19 2 7 1.0 6.0 NaN NaN + 2 2022-09-20 3 8 2.0 7.0 1.0 6.0 + 3 2022-09-21 4 9 3.0 8.0 2.0 7.0 + 4 2022-09-22 5 10 4.0 9.0 3.0 8.0 """ def __init__( diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index dc17123fb..f3c4e90d8 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -121,6 +121,28 @@ class WindowFeatures(BaseForecastTransformer): pandas.rolling pandas.aggregate pandas.shift + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.timeseries.forecasting import WindowFeatures + >>> X = pd.DataFrame(dict(date = ["2022-09-18", + >>> "2022-09-19", + >>> "2022-09-20", + >>> "2022-09-21", + >>> "2022-09-22"], + >>> x1 = [1,2,3,4,5], + >>> x2 = [6,7,8,9,10] + >>> )) + >>> wf = WindowFeatures(window = 2) + >>> wf.fit_transform(X) + date x1 x2 x1_window_2_mean x2_window_2_mean + 0 2022-09-18 1 6 NaN NaN + 1 2022-09-19 2 7 NaN NaN + 2 2022-09-20 3 8 1.5 6.5 + 3 2022-09-21 4 9 2.5 7.5 + 4 2022-09-22 5 10 3.5 8.5 """ def __init__( From 3fa9c8a53d8b35e71a11d45c2390153c79ba405c Mon Sep 17 00:00:00 2001 From: Alfonso Tobar <48638337+datacubeR@users.noreply.github.com> Date: Thu, 23 Mar 2023 04:28:36 -0300 Subject: [PATCH 2/6] Add code examples in transformation module's docstrings (#646) * Adding code examples for Transformation module * Fixing details in the examples --- feature_engine/transformation/arcsin.py | 18 +++++++++++ 
feature_engine/transformation/boxcox.py | 19 +++++++++++ feature_engine/transformation/log.py | 36 +++++++++++++++++++++ feature_engine/transformation/power.py | 18 +++++++++++ feature_engine/transformation/reciprocal.py | 18 +++++++++++ feature_engine/transformation/yeojohnson.py | 23 +++++++++++-- 6 files changed, 130 insertions(+), 2 deletions(-) diff --git a/feature_engine/transformation/arcsin.py b/feature_engine/transformation/arcsin.py index 7dcad3a18..0cb9c80d1 100644 --- a/feature_engine/transformation/arcsin.py +++ b/feature_engine/transformation/arcsin.py @@ -80,6 +80,24 @@ class ArcsinTransformer(BaseNumericalTransformer): transform: Apply the arcsin transformation. + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import ArcsinTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.beta(1, 1, size = 100))) + >>> ast = ArcsinTransformer() + >>> ast.fit(X) + >>> X = ast.transform(X) + >>> X.head() + x + 0 0.785437 + 1 0.253389 + 2 0.144664 + 3 0.783236 + 4 0.650777 """ def __init__( diff --git a/feature_engine/transformation/boxcox.py b/feature_engine/transformation/boxcox.py index fa9de5caa..9597e5f79 100644 --- a/feature_engine/transformation/boxcox.py +++ b/feature_engine/transformation/boxcox.py @@ -93,6 +93,25 @@ class BoxCoxTransformer(BaseNumericalTransformer): .. [1] Box and Cox. "An Analysis of Transformations". Read at a RESEARCH MEETING, 1964. 
https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1964.tb00553.x + + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import BoxCoxTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.lognormal(size = 100))) + >>> bct = BoxCoxTransformer() + >>> bct.fit(X) + >>> X = bct.transform(X) + >>> X.head() + x + 0 0.505485 + 1 -0.137595 + 2 0.662654 + 3 1.607518 + 4 -0.232237 """ def __init__( diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index 2c1a466ce..b94269220 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -77,6 +77,24 @@ class LogTransformer(BaseNumericalTransformer): transform: Transform the variables using the logarithm. + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import LogTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.lognormal(size = 100))) + >>> lt = LogTransformer() + >>> lt.fit(X) + >>> X = lt.transform(X) + >>> X.head() + x + 0 0.496714 + 1 -0.138264 + 2 0.647689 + 3 1.523030 + 4 -0.234153 """ def __init__( @@ -263,6 +281,24 @@ class LogCpTransformer(BaseNumericalTransformer, FitFromDictMixin): transform: Transform the variables with the logarithm of x plus C. 
+ Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import LogCpTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.lognormal(size = 100))) + >>> lct = LogCpTransformer() + >>> lct.fit(X) + >>> X = lct.transform(X) + >>> X.head() + x + 0 0.944097 + 1 0.586701 + 2 1.043204 + 3 1.707159 + 4 0.541405 """ def __init__( diff --git a/feature_engine/transformation/power.py b/feature_engine/transformation/power.py index c318e1d6b..456b5d873 100644 --- a/feature_engine/transformation/power.py +++ b/feature_engine/transformation/power.py @@ -74,6 +74,24 @@ class PowerTransformer(BaseNumericalTransformer): transform: Apply the power transformation to the variables. + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import PowerTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.lognormal(size = 100))) + >>> pt = PowerTransformer() + >>> pt.fit(X) + >>> X = pt.transform(X) + >>> X.head() + x + 0 1.281918 + 1 0.933203 + 2 1.382432 + 3 2.141518 + 4 0.889517 """ def __init__( diff --git a/feature_engine/transformation/reciprocal.py b/feature_engine/transformation/reciprocal.py index 4ad399004..05546952e 100644 --- a/feature_engine/transformation/reciprocal.py +++ b/feature_engine/transformation/reciprocal.py @@ -73,6 +73,24 @@ class ReciprocalTransformer(BaseNumericalTransformer): transform: Apply the reciprocal 1 / x transformation. 
+ Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import ReciprocalTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = 10 - np.random.exponential(size = 100))) + >>> rt = ReciprocalTransformer() + >>> rt.fit(X) + >>> X = rt.transform(X) + >>> X.head() + x + 0 0.104924 + 1 0.143064 + 2 0.115164 + 3 0.110047 + 4 0.101726 """ def __init__( diff --git a/feature_engine/transformation/yeojohnson.py b/feature_engine/transformation/yeojohnson.py index fb6b1586a..c658fd502 100644 --- a/feature_engine/transformation/yeojohnson.py +++ b/feature_engine/transformation/yeojohnson.py @@ -74,11 +74,30 @@ class YeoJohnsonTransformer(BaseNumericalTransformer): References ---------- .. [1] Yeo, In-Kwon and Johnson, Richard (2000). - A new family of power transformations to improve normality or symmetry. - Biometrika, 87, 954-959. + A new family of power transformations to improve normality or symmetry. + Biometrika, 87, 954-959. .. [2] Weisberg S. "Yeo-Johnson Power Transformations". 
https://www.stat.umn.edu/arc/yjpower.pdf + + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import YeoJohnsonTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.lognormal(size = 100) - 10)) + >>> yjt = YeoJohnsonTransformer() + >>> yjt.fit(X) + >>> X = yjt.transform(X) + >>> X.head() + x + 0 -267042.906453 + 1 -444357.138990 + 2 -221626.115742 + 3 -23647.632651 + 4 -467264.993249 """ def __init__( From 1d9ccc47976cc1d2f4ce36396e629594edda189c Mon Sep 17 00:00:00 2001 From: Alfonso Tobar <48638337+datacubeR@users.noreply.github.com> Date: Thu, 23 Mar 2023 04:40:21 -0300 Subject: [PATCH 3/6] Add code examples in outliers module's docstrings (#644) * Adding code examples for Winsorizer, ArbitraryCapper and Trimmer * Fixing details in the examples * add display output to example --------- Co-authored-by: Soledad Galli --- feature_engine/outliers/artbitrary.py | 21 ++++++++++ feature_engine/outliers/trimmer.py | 55 +++++++++++++++++++++++++++ feature_engine/outliers/winsorizer.py | 42 ++++++++++++++++++++ 3 files changed, 118 insertions(+) diff --git a/feature_engine/outliers/artbitrary.py b/feature_engine/outliers/artbitrary.py index d010aecd6..ceb7f2cfc 100644 --- a/feature_engine/outliers/artbitrary.py +++ b/feature_engine/outliers/artbitrary.py @@ -91,6 +91,27 @@ class ArbitraryOutlierCapper(BaseOutlier): transform: Cap the variables. 
+ Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.outliers import ArbitraryOutlierCapper + >>> X = pd.DataFrame(dict(x1 = [1,2,3,4,5,6,7,8,9,10])) + >>> aoc = ArbitraryOutlierCapper(max_capping_dict=dict(x1 = 8), + >>> min_capping_dict=dict(x1 = 2)) + >>> aoc.fit(X) + >>> aoc.transform(X) + x1 + 0 2 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 7 + 7 8 + 8 8 + 9 8 """ def __init__( diff --git a/feature_engine/outliers/trimmer.py b/feature_engine/outliers/trimmer.py index 55afbb579..897c91634 100644 --- a/feature_engine/outliers/trimmer.py +++ b/feature_engine/outliers/trimmer.py @@ -89,6 +89,61 @@ class OutlierTrimmer(WinsorizerBase): transform: Remove outliers. + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.outliers import OutlierTrimmer + >>> X = pd.DataFrame(dict(x = [0.49671, + >>> -0.1382, + >>> 0.64768, + >>> 1.52302, + >>> -0.2341, + >>> -17.2341, + >>> 1.57921, + >>> 0.76743, + >>> -0.4694, + >>> 0.54256])) + >>> ot = OutlierTrimmer(capping_method='gaussian', tail='left', fold=3) + >>> ot.fit(X) + >>> ot.transform(X) + x + 0 0.49671 + 1 -0.13820 + 2 0.64768 + 3 1.52302 + 4 -0.23410 + 5 -17.23410 + 6 1.57921 + 7 0.76743 + 8 -0.46940 + 9 0.54256 + + >>> import pandas as pd + >>> from feature_engine.outliers import OutlierTrimmer + >>> X = pd.DataFrame(dict(x = [0.49671, + >>> -0.1382, + >>> 0.64768, + >>> 1.52302, + >>> -0.2341, + >>> -17.2341, + >>> 1.57921, + >>> 0.76743, + >>> -0.4694, + >>> 0.54256])) + >>> ot = OutlierTrimmer(capping_method='mad', tail='left', fold=3) + >>> ot.fit(X) + >>> ot.transform(X) + x + 0 0.49671 + 1 -0.13820 + 2 0.64768 + 3 1.52302 + 4 -0.23410 + 6 1.57921 + 7 0.76743 + 8 -0.46940 + 9 0.54256 """ def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/outliers/winsorizer.py b/feature_engine/outliers/winsorizer.py index 92480438d..4a44e6052 100644 --- a/feature_engine/outliers/winsorizer.py +++ b/feature_engine/outliers/winsorizer.py @@ -97,6 +97,48 @@ class 
Winsorizer(WinsorizerBase): transform: Cap the variables. + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.outliers import Winsorizer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.normal(size = 10))) + >>> wz = Winsorizer(capping_method='mad', tail='both', fold=3) + >>> wz.fit(X) + >>> wz.transform(X) + x + 0 0.496714 + 1 -0.138264 + 2 0.647689 + 3 1.523030 + 4 -0.234153 + 5 -0.234137 + 6 1.579213 + 7 0.767435 + 8 -0.469474 + 9 0.542560 + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.outliers import Winsorizer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.normal(size = 10))) + >>> wz = Winsorizer(capping_method='mad', tail='both', fold=3) + >>> wz.fit(X) + >>> wz.transform(X) + x + 0 0.496714 + 1 -0.138264 + 2 0.647689 + 3 1.523030 + 4 -0.234153 + 5 -0.234137 + 6 1.579213 + 7 0.767435 + 8 -0.469474 + 9 0.542560 """ def __init__( From 9d4db3ea974dc6a2ea004391caf77ded55d262a7 Mon Sep 17 00:00:00 2001 From: Alfonso Tobar <48638337+datacubeR@users.noreply.github.com> Date: Thu, 23 Mar 2023 04:46:06 -0300 Subject: [PATCH 4/6] Add code examples in preprocessing and wrappers modules' docstrings (#643) * Adding code examples for SkWrapper, MatchCategories, and MatchVariables * Fixing details in the examples * modify sparse parameter in ohe --------- Co-authored-by: datacubeR Co-authored-by: Soledad Galli --- .../preprocessing/match_categories.py | 21 +++++++++ feature_engine/preprocessing/match_columns.py | 44 +++++++++++++++++++ feature_engine/wrappers/wrappers.py | 40 +++++++++++++++++ 3 files changed, 105 insertions(+) diff --git a/feature_engine/preprocessing/match_categories.py b/feature_engine/preprocessing/match_categories.py index f75709c3f..e4be96251 100644 --- a/feature_engine/preprocessing/match_categories.py +++ b/feature_engine/preprocessing/match_categories.py @@ -88,6 +88,27 @@ class MatchCategories( transform: Enforce the type of 
categorical variables as dtype `categorical`. + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.preprocessing import MatchCategories + >>> X_train = pd.DataFrame(dict(x1 = ["a","b","c"], x2 = [4,5,6])) + >>> X_test = pd.DataFrame(dict(x1 = ["c","b","a","d"], x2 = [5,6,4,7])) + >>> mc = MatchCategories(missing_values="ignore") + >>> mc.fit(X_train) + >>> mc.transform(X_train) + x1 x2 + 0 a 4 + 1 b 5 + 2 c 6 + >>> mc.transform(X_test) + x1 x2 + 0 c 5 + 1 b 6 + 2 a 4 + 3 NaN 7 """ def __init__( diff --git a/feature_engine/preprocessing/match_columns.py b/feature_engine/preprocessing/match_columns.py index ff2b4f22a..c40243d13 100644 --- a/feature_engine/preprocessing/match_columns.py +++ b/feature_engine/preprocessing/match_columns.py @@ -100,6 +100,50 @@ class MatchVariables(BaseEstimator, TransformerMixin, GetFeatureNamesOutMixin): transform: Add or delete variables to match those observed in the train set. + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.preprocessing import MatchVariables + >>> X_train = pd.DataFrame(dict(x1 = ["a","b","c"], x2 = [4,5,6])) + >>> X_test = pd.DataFrame(dict(x1 = ["c","b","a","d"], + >>> x2 = [5,6,4,7], + >>> x3 = [1,1,1,1])) + >>> mv = MatchVariables(missing_values="ignore") + >>> mv.fit(X_train) + >>> mv.transform(X_train) + x1 x2 + 0 a 4 + 1 b 5 + 2 c 6 + >>> mv.transform(X_test) + The following variables are dropped from the DataFrame: ['x3'] + x1 x2 + 0 c 5 + 1 b 6 + 2 a 4 + 3 d 7 + + >>> import pandas as pd + >>> from feature_engine.preprocessing import MatchVariables + >>> X_train = pd.DataFrame(dict(x1 = ["a","b","c"], + >>> x2 = [4,5,6], x3 = [1,1,1])) + >>> X_test = pd.DataFrame(dict(x1 = ["c","b","a","d"], x2 = [5,6,4,7])) + >>> mv = MatchVariables(missing_values="ignore") + >>> mv.fit(X_train) + >>> mv.transform(X_train) + x1 x2 x3 + 0 a 4 1 + 1 b 5 1 + 2 c 6 1 + >>> mv.transform(X_test) + The following variables are added to the DataFrame: ['x3'] + x1 x2 x3 + 0 c 5 
NaN + 1 b 6 NaN + 2 a 4 NaN + 3 d 7 NaN """ def __init__( diff --git a/feature_engine/wrappers/wrappers.py b/feature_engine/wrappers/wrappers.py index de4e8aa46..a621a85b5 100644 --- a/feature_engine/wrappers/wrappers.py +++ b/feature_engine/wrappers/wrappers.py @@ -144,6 +144,46 @@ class SklearnTransformerWrapper(BaseEstimator, TransformerMixin): See Also -------- sklearn.compose.ColumnTransformer + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.wrappers import SklearnTransformerWrapper + >>> from sklearn.preprocessing import StandardScaler + >>> X = pd.DataFrame(dict(x1 = ["a","b","c"], x2 = [1,2,3], x3 = [4,5,6])) + >>> skw = SklearnTransformerWrapper(StandardScaler()) + >>> skw.fit(X) + >>> skw.transform(X) + x1 x2 x3 + 0 a -1.224745 -1.224745 + 1 b 0.000000 0.000000 + 2 c 1.224745 1.224745 + + >>> import pandas as pd + >>> from feature_engine.wrappers import SklearnTransformerWrapper + >>> from sklearn.preprocessing import OneHotEncoder + >>> X = pd.DataFrame(dict(x1 = ["a","b","c"], x2 = [1,2,3], x3 = [4,5,6])) + >>> skw = SklearnTransformerWrapper( + >>> OneHotEncoder(sparse_output = False), variables = "x1") + >>> skw.fit(X) + >>> skw.transform(X) + x2 x3 x1_a x1_b x1_c + 0 1 4 1.0 0.0 0.0 + 1 2 5 0.0 1.0 0.0 + 2 3 6 0.0 0.0 1.0 + + >>> import pandas as pd + >>> from feature_engine.wrappers import SklearnTransformerWrapper + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = pd.DataFrame(dict(x1 = ["a","b","c"], x2 = [1,2,3], x3 = [4,5,6])) + >>> skw = SklearnTransformerWrapper(PolynomialFeatures(include_bias = False)) + >>> skw.fit(X) + >>> skw.transform(X) + x1 x2 x3 x2^2 x2 x3 x3^2 + 0 a 1.0 4.0 1.0 4.0 16.0 + 1 b 2.0 5.0 4.0 10.0 25.0 + 2 c 3.0 6.0 9.0 18.0 36.0 """ def __init__( From e73772d7529baf3599193a488a544e3d043ee6d3 Mon Sep 17 00:00:00 2001 From: Soledad Galli Date: Mon, 27 Mar 2023 19:05:12 +0200 Subject: [PATCH 5/6] Release 1.6 (#617) * release 1.6 * bump version * fixes typo in username * update 
what's new with latest commits * add code examples contribution * update what's new and fix typos --- .circleci/config.yml | 2 +- docs/whats_new/index.rst | 1 + docs/whats_new/v_160.rst | 94 ++++++++++++++++++++++++++++++++++++++++ feature_engine/VERSION | 2 +- 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 docs/whats_new/v_160.rst diff --git a/.circleci/config.yml b/.circleci/config.yml index d7fbb25c5..cd3e2bffc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -151,4 +151,4 @@ workflows: filters: branches: only: - - 1.5.X \ No newline at end of file + - 1.6.X \ No newline at end of file diff --git a/docs/whats_new/index.rst b/docs/whats_new/index.rst index 91cb58947..2a1106fb0 100644 --- a/docs/whats_new/index.rst +++ b/docs/whats_new/index.rst @@ -8,6 +8,7 @@ Find out what's new in each new version release. .. toctree:: :maxdepth: 2 + v_160 v_150 v_140 v_130 diff --git a/docs/whats_new/v_160.rst b/docs/whats_new/v_160.rst new file mode 100644 index 000000000..24eb89159 --- /dev/null +++ b/docs/whats_new/v_160.rst @@ -0,0 +1,94 @@ +Version 1.6.X +============= + +Version 1.6.0 +------------- + +Deployed: 16th March 2023 + +Contributors +~~~~~~~~~~~~ + +- `Gleb Levitski `_ +- `Morgan Sell `_ +- `Alfonso Tobar `_ +- `Nodar Okroshiashvili `_ +- `Luís Seabra `_ +- `Kyle Gilde `_ +- `Soledad Galli `_ + +In this release, we make Feature-engine transformers compatible with the `set_output` +API from Scikit-learn, which was released in version 1.2.0. We also make Feature-engine +compatible with the newest direction of pandas, in removing the `inplace` functionality +that our transformers use under the hood. + +We introduce a major change: most of the **categorical encoders can now encode variables +even if they have missing data**. + +We are also releasing **3 brand new transformers**: One for discretization, one for feature +selection and one for operations between datetime variables. 
+ +We also made a major improvement in the performance of the `DropDuplicateFeatures` and some +smaller bug fixes here and there. + +We'd like to thank all contributors for fixing bugs and expanding the functionality +and documentation of Feature-engine. + +Thank you so much to all contributors and to those of you who created issues flagging bugs or +requesting new functionality. + +New transformers +~~~~~~~~~~~~~~~~ + +- **ProbeFeatureSelection**: introduces random features and selects variables whose importance is greater than the random ones (`Morgan Sell `_ and `Soledad Galli `_) +- **DatetimeSubtraction**: creates new features by subtracting datetime variables (`Kyle Gilde `_ and `Soledad Galli `_) +- **GeometricWidthDiscretiser**: sorts continuous variables into intervals determined by geometric progression (`Gleb Levitski `_) + +New functionality +~~~~~~~~~~~~~~~~~ + +- Allow categorical encoders to encode variables with NaN (`Soledad Galli `_) +- Make transformers compatible with new `set_output` functionality from sklearn (`Soledad Galli `_) +- The `ArbitraryDiscretiser()` now includes the lowest limits in the intervals (`Soledad Galli `_) + +New modules +~~~~~~~~~~~ + +- New **Datasets** module with functions to load specific datasets (`Alfonso Tobar `_) +- New **variable_handling** module with functions to automatically select numerical, categorical, or datetime variables (`Soledad Galli `_) + +Bug fixes +~~~~~~~~~ + +- Fixed bug in `DropFeatures()` (`Luís Seabra `_) +- Fixed bug in `RecursiveFeatureElimination()` caused when only 1 feature remained in data (`Soledad Galli `_) + +Documentation +~~~~~~~~~~~~~ + +- Add example code snippets to the selection module API docs (`Alfonso Tobar `_) +- Add example code snippets to the outlier module API docs (`Alfonso Tobar `_) +- Add example code snippets to the transformation module API docs (`Alfonso Tobar `_) +- Add example code snippets to the time series module API docs (`Alfonso Tobar `_) +- Add example 
code snippets to the preprocessing module API docs (`Alfonso Tobar `_) +- Add example code snippets to the wrapper module API docs (`Alfonso Tobar `_) +- Updated documentation using new Dataset module (`Alfonso Tobar `_ and `Soledad Galli `_) +- Reorganized Readme badges (`Gleb Levitski `_) +- New Jupyter notebooks for `GeometricWidthDiscretiser` (`Gleb Levitski `_) +- Fixed typos (`Gleb Levitski `_) +- Remove examples using the boston house dataset (`Soledad Galli `_) +- Update sponsor page and contribute page (`Soledad Galli `_) + + +Deprecations +~~~~~~~~~~~~ + +- The class `PRatioEncoder` is no longer supported and was removed from the API (`Soledad Galli `_) + +Code improvements +~~~~~~~~~~~~~~~~~ + +- Massive improvement in the performance (speed) of `DropDuplicateFeatures()` (`Nodar Okroshiashvili `_) +- Remove `inplace` and other issues related to pandas new direction (`Luís Seabra `_) +- Move most docstrings to dedicated docstrings module (`Soledad Galli `_) +- Unnest tests for encoders (`Soledad Galli `_) diff --git a/feature_engine/VERSION b/feature_engine/VERSION index 4cda8f19e..dc1e644a1 100644 --- a/feature_engine/VERSION +++ b/feature_engine/VERSION @@ -1 +1 @@ -1.5.2 +1.6.0 From feddb0628c9e7ffe866418a81c697cecc62c1987 Mon Sep 17 00:00:00 2001 From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com> Date: Sat, 22 Apr 2023 14:14:52 +0200 Subject: [PATCH 6/6] refactor code to work with pandas 2.0 (#660) * Transform positional argument into keyword argument From pandas 2.0 any only accepts keyword arguments ref https://github.com/pandas-dev/pandas/pull/44896 * Change how reciprocal is computed I have not fully understood why this solves the problem, but splitting the operation in 2 lines does not seem to work * Catch warnings from pandas.to_datetime Now pandas.to_datetime raises a warning when the column cannot be converted * check_dtype=False in tests datetime features Pandas dataframes created from python 
integers are created with int column types `int64` but the operation tested returns `int32` which caused issues * Use droplevel before merging Merging dfs with different column levels has been disallowed ref https://github.com/pandas-dev/pandas/issues/34862 * Change expected values for months I am not sure why this caused an issue, maybe due to type casting? * run black * run black on tests * isort _variable_type_checks.py * Fix datetime_subtraction --------- Co-authored-by: Claudio Salvatore Arcidiacono --- .../datetime/datetime_subtraction.py | 2 +- .../imputation/drop_missing_data.py | 2 +- feature_engine/transformation/reciprocal.py | 6 ++--- .../_variable_type_checks.py | 8 +++--- tests/test_datetime/test_datetime_features.py | 26 ++++++++++++++----- .../test_forecasting/test_window_features.py | 4 +++ 6 files changed, 34 insertions(+), 14 deletions(-) diff --git a/feature_engine/datetime/datetime_subtraction.py b/feature_engine/datetime/datetime_subtraction.py index 2a900a280..8fdb83b2e 100644 --- a/feature_engine/datetime/datetime_subtraction.py +++ b/feature_engine/datetime/datetime_subtraction.py @@ -318,7 +318,7 @@ def _sub(self, dt_df: pd.DataFrame): new_df[new_varnames] = ( dt_df[self.variables_] .sub(dt_df[reference], axis=0) - .apply(lambda s: s / np.timedelta64(1, self.output_unit)) + .div(np.timedelta64(1, self.output_unit).astype("timedelta64[ns]")) ) if self.new_variables_names is not None: diff --git a/feature_engine/imputation/drop_missing_data.py b/feature_engine/imputation/drop_missing_data.py index cd9e6fe12..c6af28366 100644 --- a/feature_engine/imputation/drop_missing_data.py +++ b/feature_engine/imputation/drop_missing_data.py @@ -205,7 +205,7 @@ def return_na_data(self, X: pd.DataFrame) -> pd.DataFrame: idx = pd.isnull(X[self.variables_]).mean(axis=1) >= self.threshold idx = idx[idx] else: - idx = pd.isnull(X[self.variables_]).any(1) + idx = pd.isnull(X[self.variables_]).any(axis=1) idx = idx[idx] return X.loc[idx.index, :] diff --git 
a/feature_engine/transformation/reciprocal.py b/feature_engine/transformation/reciprocal.py index 05546952e..66fe7f38b 100644 --- a/feature_engine/transformation/reciprocal.py +++ b/feature_engine/transformation/reciprocal.py @@ -96,7 +96,6 @@ class ReciprocalTransformer(BaseNumericalTransformer): def __init__( self, variables: Union[None, int, str, List[Union[str, int]]] = None ) -> None: - self.variables = _check_init_parameter_variables(variables) def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): @@ -152,8 +151,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # transform # for some reason reciprocal does not work with integers - X.loc[:, self.variables_] = X.loc[:, self.variables_].astype("float") - X.loc[:, self.variables_] = np.reciprocal(X.loc[:, self.variables_]) + X.loc[:, self.variables_] = np.reciprocal( + X.loc[:, self.variables_].astype("float") + ) return X diff --git a/feature_engine/variable_handling/_variable_type_checks.py b/feature_engine/variable_handling/_variable_type_checks.py index 4031a0597..fe4f0ac2d 100644 --- a/feature_engine/variable_handling/_variable_type_checks.py +++ b/feature_engine/variable_handling/_variable_type_checks.py @@ -1,3 +1,5 @@ +import warnings + import pandas as pd from pandas.core.dtypes.common import is_categorical_dtype as is_categorical from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime @@ -6,7 +8,6 @@ def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: - # check for datetime only if object cannot be cast as numeric because # if it could pd.to_datetime would convert it to datetime regardless if is_object(column): @@ -25,7 +26,9 @@ def _is_categories_num(column: pd.Series) -> bool: def _is_convertible_to_dt(column: pd.Series) -> bool: - return is_datetime(pd.to_datetime(column, errors="ignore", utc=True)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return is_datetime(pd.to_datetime(column, errors="ignore", utc=True)) def 
_is_convertible_to_num(column: pd.Series) -> bool: @@ -33,7 +36,6 @@ def _is_convertible_to_num(column: pd.Series) -> bool: def _is_categorical_and_is_datetime(column: pd.Series) -> bool: - # check for datetime only if object cannot be cast as numeric because # if it could pd.to_datetime would convert it to datetime regardless if is_object(column): diff --git a/tests/test_datetime/test_datetime_features.py b/tests/test_datetime/test_datetime_features.py index 410e14d68..1727f27b6 100644 --- a/tests/test_datetime/test_datetime_features.py +++ b/tests/test_datetime/test_datetime_features.py @@ -183,6 +183,7 @@ def test_extract_datetime_features_with_default_options( df_datetime_transformed[ vars_non_dt + [var + feat for var in vars_dt for feat in feat_names_default] ], + check_dtype=False, ) @@ -198,6 +199,7 @@ def test_extract_datetime_features_from_specified_variables( + ["datetime_range", "date_obj2", "time_obj"] + ["date_obj1" + feat for feat in feat_names_default] ], + check_dtype=False, ) # multiple datetime variables @@ -215,6 +217,7 @@ def test_extract_datetime_features_from_specified_variables( for feat in feat_names_default ] ], + check_dtype=False, ) # multiple datetime variables in different order than they appear in the df @@ -232,6 +235,7 @@ def test_extract_datetime_features_from_specified_variables( for feat in feat_names_default ] ], + check_dtype=False, ) # datetime variable is index @@ -251,12 +255,15 @@ def test_extract_datetime_features_from_specified_variables( ], axis=1, ), + check_dtype=False, ) def test_extract_all_datetime_features(df_datetime, df_datetime_transformed): X = DatetimeFeatures(features_to_extract="all").fit_transform(df_datetime) - pd.testing.assert_frame_equal(X, df_datetime_transformed.drop(vars_dt, axis=1)) + pd.testing.assert_frame_equal( + X, df_datetime_transformed.drop(vars_dt, axis=1), check_dtype=False + ) def test_extract_specified_datetime_features(df_datetime, df_datetime_transformed): @@ -269,6 +276,7 @@ def 
test_extract_specified_datetime_features(df_datetime, df_datetime_transforme vars_non_dt + [var + "_" + feat for var in vars_dt for feat in ["semester", "week"]] ], + check_dtype=False, ) # different order than they appear in the glossary @@ -281,6 +289,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme vars_non_dt + [var + "_" + feat for var in vars_dt for feat in ["hour", "day_of_week"]] ], + check_dtype=False, ) @@ -290,7 +299,9 @@ def test_extract_features_from_categorical_variable( cat_date = pd.DataFrame({"date_obj1": df_datetime["date_obj1"].astype("category")}) X = DatetimeFeatures(variables="date_obj1").fit_transform(cat_date) pd.testing.assert_frame_equal( - X, df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]] + X, + df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]], + check_dtype=False, ) @@ -311,6 +322,7 @@ def test_extract_features_from_different_timezones( df_datetime_transformed[["time_obj_hour"]].apply( lambda x: x.subtract(time_zones) ), + check_dtype=False, ) exp_err_msg = ( "ValueError: variable(s) time_obj " @@ -356,7 +368,7 @@ def test_extract_features_from_localized_tz_variables(): # transform X = transformer.transform(tz_df) df_expected = pd.DataFrame({"date_var_hour": [1, 2, 2, 2, 2, 3, 3]}) - pd.testing.assert_frame_equal(X, df_expected) + pd.testing.assert_frame_equal(X, df_expected, check_dtype=False) # when utc is True transformer = DatetimeFeatures(features_to_extract=["hour"], utc=True).fit(tz_df) @@ -372,7 +384,7 @@ def test_extract_features_from_localized_tz_variables(): # transform X = transformer.transform(tz_df) df_expected = pd.DataFrame({"date_var_hour": [5, 6, 6, 6, 6, 7, 7]}) - pd.testing.assert_frame_equal(X, df_expected) + pd.testing.assert_frame_equal(X, df_expected, check_dtype=False) def test_extract_features_without_dropping_original_variables( @@ -399,6 +411,7 @@ def test_extract_features_without_dropping_original_variables( ], 
axis=1, ), + check_dtype=False, ) @@ -435,6 +448,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime): pd.testing.assert_frame_equal( X, pd.DataFrame({"date_obj2_day_of_month": [10, 31, 30, 17]}), + check_dtype=False, ) X = DatetimeFeatures(features_to_extract=["year"], yearfirst=True).fit_transform( @@ -443,6 +457,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime): pd.testing.assert_frame_equal( X, pd.DataFrame({"date_obj2_year": [2010, 2009, 1995, 2004]}), + check_dtype=False, ) @@ -457,8 +472,7 @@ def test_get_feature_names_out(df_datetime, df_datetime_transformed): transformer.get_feature_names_out(input_features=vars_dt) with pytest.raises(ValueError): - transformer.get_feature_names_out(input_features=["date_obj1"])\ - + transformer.get_feature_names_out(input_features=["date_obj1"]) # default features from 1 variable transformer = DatetimeFeatures(variables="date_obj1") X = transformer.fit_transform(df_datetime) diff --git a/tests/test_time_series/test_forecasting/test_window_features.py b/tests/test_time_series/test_forecasting/test_window_features.py index 344f90e3f..a03259b7e 100644 --- a/tests/test_time_series/test_forecasting/test_window_features.py +++ b/tests/test_time_series/test_forecasting/test_window_features.py @@ -380,8 +380,10 @@ def test_multiple_windows(df_time): X = df_time.copy() num_vars = ["ambient_temp", "module_temp", "irradiation"] tmp = X[num_vars].rolling(2).agg(["sum", "mean"]).shift(periods=15, freq="min") + tmp.columns = tmp.columns.droplevel() X_tr = X.merge(tmp, left_index=True, right_index=True, how="left") tmp = X[num_vars].rolling(3).agg(["sum", "mean"]).shift(periods=15, freq="min") + tmp.columns = tmp.columns.droplevel() X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left") X_tr.columns = transformer.get_feature_names_out() @@ -404,6 +406,7 @@ def test_multiple_windows(df_time): .agg(["sum", "mean"]) .shift(freq="30min") ) + tmp.columns = 
tmp.columns.droplevel() X_tr = X.merge(tmp, left_index=True, right_index=True, how="left") tmp = ( X[["ambient_temp", "irradiation"]] @@ -411,6 +414,7 @@ def test_multiple_windows(df_time): .agg(["sum", "mean"]) .shift(freq="30min") ) + tmp.columns = tmp.columns.droplevel() X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left") X_tr.columns = transformer.get_feature_names_out()