Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New feature: Lag or windows features grouped by #727

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
4d653a9
add group by variables to base forecast transformer
Ezzaldin97 Feb 23, 2024
4e9d849
add group by variables to lag_features
Ezzaldin97 Feb 23, 2024
7f40391
add group by window features
Ezzaldin97 Feb 25, 2024
b476748
add group by expanding window features
Ezzaldin97 Feb 25, 2024
02c59bd
add test cases of groupby timeseries features
Ezzaldin97 Feb 25, 2024
0dd92cc
ensure code style tests
Ezzaldin97 Feb 25, 2024
47de2d6
fixing typehint errors
Ezzaldin97 Feb 25, 2024
dd43c27
fixing docs indentation issue
Ezzaldin97 Feb 25, 2024
7459811
fixing docs indentation issue in lag_features
Ezzaldin97 Feb 25, 2024
12aa825
adjust formatting and code style in tests
Ezzaldin97 Feb 29, 2024
c3bee66
refactoring timeseries & reformatting the code
Ezzaldin97 Feb 29, 2024
67725dc
adjust code formatting & style in tests
Ezzaldin97 Mar 2, 2024
9cb01ea
fix create lag features using groupby & freq parameters
Ezzaldin97 Mar 2, 2024
72ce43c
adjust code style
Ezzaldin97 Mar 2, 2024
9d999b0
add test cases to ensure code coverage
Ezzaldin97 Mar 2, 2024
b7b8bc9
add group_by docstring to _docstring
Ezzaldin97 Apr 1, 2024
ba375a4
remove check input of group_by
Ezzaldin97 Apr 1, 2024
90f08f4
enhance performance of group_by window features operations
Ezzaldin97 Apr 1, 2024
66baa75
enhance performance of group_by expanding window features operations
Ezzaldin97 Apr 1, 2024
92f996d
fix reindexing to original index after grouping bug
Ezzaldin97 Apr 1, 2024
152c037
fix reindexing to original index after grouping operation bug
Ezzaldin97 Apr 1, 2024
5343e50
replacing group_by docstring with group_by_docstring
Ezzaldin97 Apr 1, 2024
ef1eaa8
adjust code-style and formatting
Ezzaldin97 Apr 1, 2024
09db782
remove white spaces
Ezzaldin97 Apr 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,10 @@
contain missing values. If `'ignore'`, missing data will be ignored when
learning parameters or performing the transformation.
""".rstrip()

# Shared description of the `group_by` init parameter. It is substituted into
# transformer class docstrings through the `{group_by}` placeholder.
_group_by_docstring = """group_by: str, int, or list of str or int, default=None
        Variable or list of variables used to split the data into groups.
        When set, the features are computed separately within each group
        (split the data, apply the operation, combine the results) instead of
        over the whole dataframe. If None, no grouping is performed.
        """.rstrip()
4 changes: 2 additions & 2 deletions feature_engine/selection/drop_psi_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import datetime
from typing import List, Union
from typing import Dict, List, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -475,7 +475,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
threshold_cat = self.threshold

# Compute the PSI by looping over the features
self.psi_values_ = {}
solegalli marked this conversation as resolved.
Show resolved Hide resolved
self.psi_values_: Dict = {}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We resolved this in a different PR. Could we remove this change from here please?

self.features_to_drop_ = []

# Compute PSI for numerical features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_drop_original_docstring,
_group_by_docstring,
_missing_values_docstring,
)
from feature_engine._docstrings.methods import _fit_not_learn_docstring
Expand All @@ -37,6 +38,7 @@
feature_names_in_=_feature_names_in_docstring,
fit=_fit_not_learn_docstring,
n_features_in_=_n_features_in_docstring,
group_by=_group_by_docstring,
)
class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOutMixin):
"""
Expand All @@ -51,6 +53,8 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu

{drop_original}

{group_by}

Attributes
----------
{feature_names_in_}
Expand All @@ -64,6 +68,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
missing_values: str = "raise",
drop_original: bool = False,
group_by: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:

if missing_values not in ["raise", "ignore"]:
Expand All @@ -81,6 +86,7 @@ def __init__(
self.variables = _check_variables_input_value(variables)
self.missing_values = missing_values
self.drop_original = drop_original
self.group_by = group_by

def _check_index(self, X: pd.DataFrame):
"""
Expand Down
52 changes: 44 additions & 8 deletions feature_engine/timeseries/forecasting/expanding_window_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from __future__ import annotations

from typing import List
from typing import List, Union

import pandas as pd

Expand All @@ -13,6 +13,7 @@
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_drop_original_docstring,
_group_by_docstring,
_missing_values_docstring,
_variables_numerical_docstring,
)
Expand All @@ -34,6 +35,7 @@
n_features_in_=_n_features_in_docstring,
fit=_fit_not_learn_docstring,
fit_transform=_fit_transform_docstring,
group_by=_group_by_docstring,
)
class ExpandingWindowFeatures(BaseForecastTransformer):
"""
Expand Down Expand Up @@ -93,6 +95,8 @@ class ExpandingWindowFeatures(BaseForecastTransformer):

{drop_original}

{group_by}

Attributes
----------
variables_:
Expand Down Expand Up @@ -151,6 +155,7 @@ def __init__(
sort_index: bool = True,
missing_values: str = "raise",
drop_original: bool = False,
group_by: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:

if not isinstance(functions, (str, list)) or not all(
Expand All @@ -168,7 +173,7 @@ def __init__(
f"periods must be a non-negative integer. Got {periods} instead."
)

super().__init__(variables, missing_values, drop_original)
super().__init__(variables, missing_values, drop_original, group_by)

self.min_periods = min_periods
self.functions = functions
Expand All @@ -193,12 +198,21 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
# Common dataframe checks and setting up.
X = self._check_transform_input_and_state(X)

tmp = (
X[self.variables_]
.expanding(min_periods=self.min_periods)
.agg(self.functions)
.shift(periods=self.periods, freq=self.freq)
)
if self.group_by:
original_index = X.index
tmp = X.groupby(self.group_by, as_index=False).apply(
self._agg_expanding_window_features,
include_groups=False,
)
tmp = tmp.set_index(original_index)
tmp = tmp.reindex(original_index)
else:
tmp = (
X[self.variables_]
.expanding(min_periods=self.min_periods)
.agg(self.functions)
.shift(periods=self.periods, freq=self.freq)
)

tmp.columns = self._get_new_features_name()

Expand All @@ -224,3 +238,25 @@ def _get_new_features_name(self) -> List:
]

return feature_names

def _agg_expanding_window_features(
    self,
    grouped_df: pd.DataFrame,
) -> Union[pd.Series, pd.DataFrame]:
    """
    Create the expanding window features for the rows of a single group.

    `transform()` passes this method to `X.groupby(self.group_by).apply(...)`,
    so it is called once per group and receives that group's rows as a
    regular dataframe (not the GroupBy object itself).

    Parameters
    ----------
    grouped_df : pd.DataFrame
        The rows of the input dataframe that belong to one group.

    Returns
    -------
    Union[pd.Series, pd.DataFrame]
        The aggregations in `self.functions`, computed over an expanding
        window of `self.variables_` and then shifted by `self.periods` /
        `self.freq`.
    """
    expanding_window = grouped_df[self.variables_].expanding(
        min_periods=self.min_periods
    )
    features = expanding_window.agg(self.functions)
    # Displace the aggregated values by `periods` rows, or by `freq` when it
    # is given, exactly as the non-grouped code path in `transform()` does.
    return features.shift(periods=self.periods, freq=self.freq)
88 changes: 71 additions & 17 deletions feature_engine/timeseries/forecasting/lag_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_drop_original_docstring,
_group_by_docstring,
_missing_values_docstring,
_variables_numerical_docstring,
)
Expand All @@ -32,6 +33,7 @@
n_features_in_=_n_features_in_docstring,
fit=_fit_not_learn_docstring,
fit_transform=_fit_transform_docstring,
group_by=_group_by_docstring,
)
class LagFeatures(BaseForecastTransformer):
"""
Expand Down Expand Up @@ -74,6 +76,8 @@ class LagFeatures(BaseForecastTransformer):

{drop_original}

{group_by}

Attributes
----------
variables_:
Expand Down Expand Up @@ -127,6 +131,7 @@ def __init__(
sort_index: bool = True,
missing_values: str = "raise",
drop_original: bool = False,
group_by: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:

if not (
Expand All @@ -151,7 +156,7 @@ def __init__(
"sort_index takes values True and False." f"Got {sort_index} instead."
)

super().__init__(variables, missing_values, drop_original)
super().__init__(variables, missing_values, drop_original, group_by)

self.periods = periods
self.freq = freq
Expand Down Expand Up @@ -180,35 +185,57 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
if isinstance(self.freq, list):
df_ls = []
for fr in self.freq:
tmp = X[self.variables_].shift(
freq=fr,
axis=0,
)
if self.group_by:
tmp = self._agg_freq_lags(
grouped_df=X.groupby(self.group_by),
freq=fr,
)
else:
tmp = X[self.variables_].shift(
freq=fr,
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)

else:
tmp = X[self.variables_].shift(
freq=self.freq,
axis=0,
)
if self.group_by:
tmp = self._agg_freq_lags(
grouped_df=X.groupby(self.group_by),
freq=self.freq,
)
else:
tmp = X[self.variables_].shift(
freq=self.freq,
axis=0,
)

else:
if isinstance(self.periods, list):
df_ls = []
for pr in self.periods:
tmp = X[self.variables_].shift(
periods=pr,
axis=0,
)
if self.group_by:
tmp = X.groupby(self.group_by)[self.variables_].shift(
periods=pr,
)
else:
tmp = X[self.variables_].shift(
periods=pr,
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)

else:
tmp = X[self.variables_].shift(
periods=self.periods,
axis=0,
)
if self.group_by:
tmp = X.groupby(self.group_by)[self.variables_].shift(
periods=self.periods,
)
else:
tmp = X[self.variables_].shift(
periods=self.periods,
axis=0,
)

tmp.columns = self._get_new_features_name()

Expand Down Expand Up @@ -243,3 +270,30 @@ def _get_new_features_name(self) -> List:
]

return feature_names

def _agg_freq_lags(
    self,
    grouped_df: pd.core.groupby.generic.DataFrameGroupBy,
    freq: Union[str, List[str]],
) -> Union[pd.Series, pd.DataFrame]:
    """
    Create frequency-based lag features separately for each group.

    For every group, the index of `self.variables_` is shifted by `freq` and
    the result is re-aligned to the group's own original index, so a row only
    receives a value when the same group has an observation exactly `freq`
    earlier. The per-group results are then stacked and sorted by index.

    Parameters
    ----------
    grouped_df : pd.core.groupby.generic.DataFrameGroupBy
        The input dataframe grouped by the `group_by` variables.
    freq : Union[str, List[str]]
        Offset to use from the tseries module or time rule. The calls in
        `transform()` pass one offset at a time.

    Returns
    -------
    Union[pd.Series, pd.DataFrame]
        The lagged variables for all groups, sorted by index.
    """
    # NOTE(review): a reviewer asked whether iterating over the groups is
    # needed, since pandas can lag per group on its own. The author reported
    # that the grouped `shift()` only gave the expected result with `periods`,
    # not with `freq`, hence this per-group pass. Revisit if that changes.
    lagged_groups = [
        # `reindex` drops the labels created by the shift that the group does
        # not have, and leaves NaN where no earlier observation exists.
        group[self.variables_].shift(freq=freq).reindex(group.index)
        for _, group in grouped_df
    ]
    # Stable sort: rows that share an index label (e.g. the same timestamp in
    # different groups) keep a deterministic relative order (group order).
    return pd.concat(lagged_groups).sort_index(kind="mergesort")
Loading