From 929ac9b08736a80cd451f7748de64603838b839d Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 20 Jul 2023 16:42:31 -0700 Subject: [PATCH 01/47] initial commit --- .../preprocessing/stl_decomposer.py | 143 ++++++++++++++---- 1 file changed, 110 insertions(+), 33 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index c11dfca1a3..87bb138a3f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -3,6 +3,7 @@ import logging +import matplotlib.pyplot as plt import pandas as pd from pandas import RangeIndex from statsmodels.tsa.arima.model import ARIMA @@ -20,6 +21,7 @@ class STLDecomposer(Decomposer): Args: time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None. + series_index(str): Specifies the name of the column in X that provides the series_id objects for multiseries. Defaults to None. degree (int): Not currently used. STL 3x "degree-like" values. None are able to be set at this time. Defaults to 1. period (int): The number of entries in the time series data that corresponds to one period of a @@ -40,14 +42,17 @@ class STLDecomposer(Decomposer): def __init__( self, time_index: str = None, + series_index: str = None, degree: int = 1, # Currently unused. period: int = None, + periods: int = None, seasonal_smoother: int = 7, random_seed: int = 0, **kwargs, ): self.logger = logging.getLogger(__name__) - + self.series_index = series_index + self.periods = [] # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. if seasonal_smoother % 2 == 0: @@ -158,35 +163,51 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: ValueError: If y is None. 
ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ - self.original_index = y.index if y is not None else None - X, y = self._check_target(X, y) - self._map_dt_to_integer(self.original_index, y.index) - # Warn for poor decomposition use with higher seasonal smoothers if self.seasonal_smoother > 14: self.logger.warning( f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) - # Save the frequency of the fitted series for checking against transform data. - self.frequency = y.index.freqstr or pd.infer_freq(y.index) - - # Determine the period of the seasonal component - if self.period is None: - self.set_period(X, y) - - stl = STL(y, seasonal=self.seasonal_smoother, period=self.period) - res = stl.fit() - self.seasonal = res.seasonal - self.period = stl.period - dist = len(y) % self.period - self.seasonality = ( - self.seasonal[-(dist + self.period) : -dist] - if dist > 0 - else self.seasonal[-self.period :] - ) - self.trend = res.trend - self.residual = res.resid + # If there is not a series_index, add a new series_id column ranging from 0 to the size of the data frame + if self.series_index is None: + X.insert(0, "series_id", range(len(X))) + + # group the data by series_id + grouped_X = X.groupby(X[self.series_index]) + # iterate through each id group + self.seasonality = [] + self.trend = [] + self.residual = [] + for series_id, series_X in grouped_X: + series_y = y.reindex(series_X.index) + self.original_index = series_y.index if series_y is not None else None + + series_X, series_y = self._check_target(series_X, series_y) + self._map_dt_to_integer(self.original_index, series_y.index) + + # Save the frequency of the fitted series for checking against transform data. 
+ self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) + + # Determine the period of the seasonal component + if self.period is None: + self.set_period(series_X, series_y) + + stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) + res = stl.fit() + self.seasonal = res.seasonal + self.period = stl.period + + self.periods.append(self.period) + + dist = len(series_y) % self.period + self.seasonality.append( + self.seasonal[-(dist + self.period) : -dist] + if dist > 0 + else self.seasonal[-self.period :], + ) + self.trend.append(res.trend) + self.residual.append(res.resid) return self @@ -359,22 +380,22 @@ def get_trend_dataframe(self, X, y): def _decompose_target(X, y, fh): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(self.trend.index) and all( - y.index == self.trend.index, + if len(y.index) == len(self.trend[self.series_id].index) and all( + y.index == self.trend[self.series_id].index, ): - trend = self.trend - seasonal = self.seasonal - residual = self.residual + trend = self.trend[self.series_id] + seasonal = self.seasonal[self.series_id] + residual = self.residual[self.series_id] else: # TODO: Do a better job cloning. 
decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, - period=self.period, + period=self.periods[self.series_id], ) decomposer.fit(X, y) - trend = decomposer.trend - seasonal = decomposer.seasonal - residual = decomposer.residual + trend = decomposer.trend[self.series_id] + seasonal = decomposer.seasonal[self.series_id] + residual = decomposer.residual[self.series_id] return pd.DataFrame( { "signal": y, @@ -432,3 +453,59 @@ def get_trend_prediction_intervals(self, y, coverage=None): prediction_interval_result[f"{coverage[i]}_upper"] = intervals["upper"] return prediction_interval_result + + # Overload the plot_decomposition function to be able to plot multiple decompositions for multiseries + def plot_decomposition( + self, + X: pd.DataFrame, + y: pd.Series, + show: bool = False, + ) -> list[tuple[plt.Figure, list]]: + """Plots the decomposition of the target signal. + + Args: + X (pd.DataFrame): Input data with time series data in index. + y (pd.Series or pd.DataFrame): Target variable data provided as a Series for univariate problems or + a DataFrame for multivariate problems. + show (bool): Whether to display the plot or not. Defaults to False. 
+ + Returns: + matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions + plotted on them + + """ + # if self.series_index is None: + # X.insert(0, 'series_id', range(len(X))) + + # group the data by series_id + grouped_X = X.groupby(X[self.series_index]) + # iterate through each id group + plot_info = [] + # for series_id, series_X in grouped_X: + for s_index, (series_id, series_X) in enumerate(grouped_X): + print("Index: " + str(s_index)) + + self.series_id = s_index + # series_y = y.reindex(series_X.index) + series_y = y[series_X.index] + + print(series_y) + decomposition_results = self.get_trend_dataframe(series_X, series_y) + fig, axs = plt.subplots(4) + fig.set_size_inches(18.5, 14.5) + axs[0].plot(decomposition_results[0]["signal"], "r") + axs[0].set_title("signal") + axs[1].plot(decomposition_results[0]["trend"], "b") + axs[1].set_title("trend") + axs[2].plot(decomposition_results[0]["seasonality"], "g") + axs[2].set_title("seasonality") + axs[3].plot(decomposition_results[0]["residual"], "y") + axs[3].set_title("residual") + + fig.suptitle("Decomposition for Series {}".format(series_id)) + + plot_info.append((fig, axs)) + plt.show(block=False) + # if show: # pragma: no cover + # plt.show(block=False) + return plot_info From fedca597d3a72e4405dac59cb78d17cf14621e5e Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 21 Jul 2023 10:18:10 -0700 Subject: [PATCH 02/47] creates multiple graphs --- .../components/transformers/preprocessing/stl_decomposer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 87bb138a3f..f77cc879e6 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -490,6 +490,9 @@ def plot_decomposition( series_y = 
y[series_X.index] print(series_y) + # will need to change later since 'freq' var needs to be mutable + series_X.index = pd.DatetimeIndex(series_X["time_index"], freq="W-FRI") + decomposition_results = self.get_trend_dataframe(series_X, series_y) fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) From a0e4a39784a739b7620a863f89debe2b22a5b237 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 21 Jul 2023 15:53:08 -0700 Subject: [PATCH 03/47] able to graph decomp --- .../preprocessing/stl_decomposer.py | 100 ++++++++++-------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index f77cc879e6..d7a0d419c8 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -45,14 +45,12 @@ def __init__( series_index: str = None, degree: int = 1, # Currently unused. period: int = None, - periods: int = None, seasonal_smoother: int = 7, random_seed: int = 0, **kwargs, ): self.logger = logging.getLogger(__name__) self.series_index = series_index - self.periods = [] # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. 
if seasonal_smoother % 2 == 0: @@ -86,7 +84,7 @@ def _project_trend(self, y): units_forward = ( len( pd.date_range( - start=self.trend.index[-1], + start=self.trend[self.group_index].index[-1], end=y.index[-1], freq=self.frequency, ), @@ -98,18 +96,18 @@ def _project_trend(self, y): # Model the trend and project it forward stlf = STLForecast( - self.trend, + self.trend[self.group_index], ARIMA, model_kwargs=dict(order=(1, 1, 0), trend="t"), - period=self.period, + period=self.periods[self.group_index], ) stlf = stlf.fit() forecast = stlf.forecast(units_forward) # Store forecast summary for use in calculating trend prediction intervals. self.forecast_summary = stlf.get_prediction( - len(self.trend), - len(self.trend) + units_forward - 1, + len(self.trend[self.group_index]), + len(self.trend[self.group_index]) + units_forward - 1, ) # Handle out-of-sample forecasts. The forecast will have additional data @@ -132,8 +130,8 @@ def _project_trend_and_seasonality(self, y): projected_seasonality = self._project_seasonal( y, - self.seasonality, - self.period, + self.seasonality[self.group_index], + self.periods[self.group_index], self.frequency, ) return projected_trend, projected_seasonality @@ -174,13 +172,15 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: X.insert(0, "series_id", range(len(X))) # group the data by series_id - grouped_X = X.groupby(X[self.series_index]) + grouped_X = X.groupby(self.series_index) # iterate through each id group + self.seasonal = [] self.seasonality = [] self.trend = [] self.residual = [] + self.periods = [] for series_id, series_X in grouped_X: - series_y = y.reindex(series_X.index) + series_y = y[series_X.index] self.original_index = series_y.index if series_y is not None else None series_X, series_y = self._check_target(series_X, series_y) @@ -188,23 +188,25 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: # Save the frequency of the fitted series for checking against transform data. 
self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) - # Determine the period of the seasonal component if self.period is None: self.set_period(series_X, series_y) stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() - self.seasonal = res.seasonal + seasonal = res.seasonal + self.seasonal.append(seasonal) self.period = stl.period self.periods.append(self.period) dist = len(series_y) % self.period self.seasonality.append( - self.seasonal[-(dist + self.period) : -dist] - if dist > 0 - else self.seasonal[-self.period :], + ( + seasonal[-(dist + self.period) : -dist] + if dist > 0 + else seasonal[-self.period :], + ), ) self.trend.append(res.trend) self.residual.append(res.resid) @@ -243,11 +245,15 @@ def transform( y_out_of_sample = pd.Series([]) # For partially and wholly in-sample data, retrieve stored results. - if self.trend.index[0] <= y.index[0] <= self.trend.index[-1]: - y_in_sample = self.residual[y.index[0] : y.index[-1]] + if ( + self.trend[self.group_index].index[0] + <= y.index[0] + <= self.trend[self.group_index].index[-1] + ): + y_in_sample = self.residual[self.group_index][y.index[0] : y.index[-1]] # For out of sample data.... - if y.index[-1] > self.trend.index[-1]: + if y.index[-1] > self.trend[self.group_index].index[-1]: try: # ...that is partially out of sample and partially in sample. 
truncated_y = y[y.index.get_loc(self.trend.index[-1]) + 1 :] @@ -307,14 +313,18 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: else y_t.index[-1] + 1 * y_t.index.freq ) trend = ( - self.trend.reset_index(drop=True)[left_index:right_index] + self.trend[self.group_index].reset_index(drop=True)[ + left_index:right_index + ] if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.trend[left_index:right_index] + else self.trend[self.group_index][left_index:right_index] ) seasonal = ( - self.seasonal.reset_index(drop=True)[left_index:right_index] + self.seasonal[self.group_index][self.group_index].reset_index( + drop=True, + )[left_index:right_index] if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.seasonal[left_index:right_index] + else self.seasonal[self.group_index][left_index:right_index] ) y_in_sample = y_t + trend + seasonal y_in_sample = y_in_sample.dropna() @@ -380,22 +390,22 @@ def get_trend_dataframe(self, X, y): def _decompose_target(X, y, fh): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(self.trend[self.series_id].index) and all( - y.index == self.trend[self.series_id].index, + if len(y.index) == len(self.trend[self.group_index].index) and all( + y.index == self.trend[self.group_index].index, ): - trend = self.trend[self.series_id] - seasonal = self.seasonal[self.series_id] - residual = self.residual[self.series_id] + trend = self.trend[self.group_index] + seasonal = self.seasonal[self.group_index] + residual = self.residual[self.group_index] else: # TODO: Do a better job cloning. 
decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, - period=self.periods[self.series_id], + period=self.period, ) decomposer.fit(X, y) - trend = decomposer.trend[self.series_id] - seasonal = decomposer.seasonal[self.series_id] - residual = decomposer.residual[self.series_id] + trend = decomposer.trend[self.group_index] + seasonal = decomposer.seasonal[self.group_index] + residual = decomposer.residual[self.group_index] return pd.DataFrame( { "signal": y, @@ -474,26 +484,27 @@ def plot_decomposition( plotted on them """ - # if self.series_index is None: - # X.insert(0, 'series_id', range(len(X))) + if self.series_index is None: + X.insert(0, "series_id", range(len(X))) # group the data by series_id - grouped_X = X.groupby(X[self.series_index]) + grouped_X = X.groupby(self.series_index) # iterate through each id group plot_info = [] # for series_id, series_X in grouped_X: - for s_index, (series_id, series_X) in enumerate(grouped_X): - print("Index: " + str(s_index)) - - self.series_id = s_index - # series_y = y.reindex(series_X.index) + for group_index, (series_id, series_X) in enumerate(grouped_X): + self.group_index = group_index series_y = y[series_X.index] - - print(series_y) + print("Index: " + str(group_index)) # will need to change later since 'freq' var needs to be mutable series_X.index = pd.DatetimeIndex(series_X["time_index"], freq="W-FRI") decomposition_results = self.get_trend_dataframe(series_X, series_y) + print("Seasonality: ") + print(self.seasonality) + print("\nTrend: ") + print(self.trend) + fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) axs[0].plot(decomposition_results[0]["signal"], "r") @@ -508,7 +519,6 @@ def plot_decomposition( fig.suptitle("Decomposition for Series {}".format(series_id)) plot_info.append((fig, axs)) - plt.show(block=False) - # if show: # pragma: no cover - # plt.show(block=False) + if show: # pragma: no cover + plt.show() return plot_info From 88f7d6735a77e8b1b5caaa6ea6b217a448b43679 Mon Sep 17 
00:00:00 2001 From: remyogasawara Date: Mon, 24 Jul 2023 09:01:17 -0700 Subject: [PATCH 04/47] graph individually --- .../preprocessing/stl_decomposer.py | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index d7a0d419c8..a33cbc4fef 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -84,7 +84,7 @@ def _project_trend(self, y): units_forward = ( len( pd.date_range( - start=self.trend[self.group_index].index[-1], + start=self.trend.index[-1], end=y.index[-1], freq=self.frequency, ), @@ -96,18 +96,18 @@ def _project_trend(self, y): # Model the trend and project it forward stlf = STLForecast( - self.trend[self.group_index], + self.trend, ARIMA, model_kwargs=dict(order=(1, 1, 0), trend="t"), - period=self.periods[self.group_index], + period=self.period, ) stlf = stlf.fit() forecast = stlf.forecast(units_forward) # Store forecast summary for use in calculating trend prediction intervals. self.forecast_summary = stlf.get_prediction( - len(self.trend[self.group_index]), - len(self.trend[self.group_index]) + units_forward - 1, + len(self.trend), + len(self.trend) + units_forward - 1, ) # Handle out-of-sample forecasts. 
The forecast will have additional data @@ -130,8 +130,8 @@ def _project_trend_and_seasonality(self, y): projected_seasonality = self._project_seasonal( y, - self.seasonality[self.group_index], - self.periods[self.group_index], + self.seasonality, + self.period, self.frequency, ) return projected_trend, projected_seasonality @@ -174,13 +174,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: # group the data by series_id grouped_X = X.groupby(self.series_index) # iterate through each id group - self.seasonal = [] - self.seasonality = [] - self.trend = [] - self.residual = [] + self.seasonals = [] + self.seasonalities = [] + self.trends = [] + self.residuals = [] self.periods = [] for series_id, series_X in grouped_X: - series_y = y[series_X.index] + series_y = y[series_X.index].copy() self.original_index = series_y.index if series_y is not None else None series_X, series_y = self._check_target(series_X, series_y) @@ -195,21 +195,21 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() seasonal = res.seasonal - self.seasonal.append(seasonal) + self.seasonals.append(seasonal) self.period = stl.period self.periods.append(self.period) dist = len(series_y) % self.period - self.seasonality.append( + self.seasonalities.append( ( seasonal[-(dist + self.period) : -dist] if dist > 0 else seasonal[-self.period :], ), ) - self.trend.append(res.trend) - self.residual.append(res.resid) + self.trends.append(res.trend) + self.residuals.append(res.resid) return self @@ -245,15 +245,11 @@ def transform( y_out_of_sample = pd.Series([]) # For partially and wholly in-sample data, retrieve stored results. 
- if ( - self.trend[self.group_index].index[0] - <= y.index[0] - <= self.trend[self.group_index].index[-1] - ): - y_in_sample = self.residual[self.group_index][y.index[0] : y.index[-1]] + if self.trend.index[0] <= y.index[0] <= self.trend.index[-1]: + y_in_sample = self.residual[y.index[0] : y.index[-1]] # For out of sample data.... - if y.index[-1] > self.trend[self.group_index].index[-1]: + if y.index[-1] > self.trend.index[-1]: try: # ...that is partially out of sample and partially in sample. truncated_y = y[y.index.get_loc(self.trend.index[-1]) + 1 :] @@ -313,18 +309,16 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: else y_t.index[-1] + 1 * y_t.index.freq ) trend = ( - self.trend[self.group_index].reset_index(drop=True)[ - left_index:right_index - ] + self.trend.reset_index(drop=True)[left_index:right_index] if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.trend[self.group_index][left_index:right_index] + else self.trend[left_index:right_index] ) seasonal = ( - self.seasonal[self.group_index][self.group_index].reset_index( + self.seasonal.reset_index( drop=True, )[left_index:right_index] if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.seasonal[self.group_index][left_index:right_index] + else self.seasonal[left_index:right_index] ) y_in_sample = y_t + trend + seasonal y_in_sample = y_in_sample.dropna() @@ -390,12 +384,12 @@ def get_trend_dataframe(self, X, y): def _decompose_target(X, y, fh): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(self.trend[self.group_index].index) and all( - y.index == self.trend[self.group_index].index, + if len(y.index) == len(self.trend.index) and all( + y.index == self.trend.index, ): - trend = self.trend[self.group_index] - seasonal = self.seasonal[self.group_index] - residual = self.residual[self.group_index] + trend = self.trend + seasonal = self.seasonal + 
residual = self.residual else: # TODO: Do a better job cloning. decomposer = STLDecomposer( @@ -403,9 +397,9 @@ def _decompose_target(X, y, fh): period=self.period, ) decomposer.fit(X, y) - trend = decomposer.trend[self.group_index] - seasonal = decomposer.seasonal[self.group_index] - residual = decomposer.residual[self.group_index] + trend = decomposer.trend + seasonal = decomposer.seasonal + residual = decomposer.residual return pd.DataFrame( { "signal": y, @@ -484,6 +478,7 @@ def plot_decomposition( plotted on them """ + # If there is not a series_index, add a new series_id column ranging from 0 to the size of the data frame if self.series_index is None: X.insert(0, "series_id", range(len(X))) @@ -493,17 +488,21 @@ def plot_decomposition( plot_info = [] # for series_id, series_X in grouped_X: for group_index, (series_id, series_X) in enumerate(grouped_X): - self.group_index = group_index - series_y = y[series_X.index] - print("Index: " + str(group_index)) + self.trend = self.trends[group_index] + self.seasonality = self.seasonalities[group_index] + self.seasonal = self.seasonals[group_index] + self.residual = self.residuals[group_index] + self.period = self.periods[group_index] + + series_y = y[series_X.index].copy() # will need to change later since 'freq' var needs to be mutable series_X.index = pd.DatetimeIndex(series_X["time_index"], freq="W-FRI") decomposition_results = self.get_trend_dataframe(series_X, series_y) - print("Seasonality: ") - print(self.seasonality) - print("\nTrend: ") - print(self.trend) + # print("Seasonality: ") + # print(self.seasonality[group_index]) + # print("\nTrend: ") + # print(self.trend[group_index]) fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) @@ -519,6 +518,7 @@ def plot_decomposition( fig.suptitle("Decomposition for Series {}".format(series_id)) plot_info.append((fig, axs)) + if show: # pragma: no cover plt.show() return plot_info From 9f4a0d55fddf58d442b1ccbc1e5d7bbb6e897391 Mon Sep 17 00:00:00 2001 From: 
remyogasawara Date: Mon, 24 Jul 2023 17:20:03 -0700 Subject: [PATCH 05/47] clean up --- .../preprocessing/stl_decomposer.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index a33cbc4fef..63b5d9153d 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -178,12 +178,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: self.seasonalities = [] self.trends = [] self.residuals = [] - self.periods = [] + for series_id, series_X in grouped_X: - series_y = y[series_X.index].copy() + series_y = y[series_X.index] self.original_index = series_y.index if series_y is not None else None series_X, series_y = self._check_target(series_X, series_y) + self._map_dt_to_integer(self.original_index, series_y.index) # Save the frequency of the fitted series for checking against transform data. 
@@ -194,23 +195,19 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() - seasonal = res.seasonal - self.seasonals.append(seasonal) + self.seasonals.append(res.seasonal) self.period = stl.period - self.periods.append(self.period) - dist = len(series_y) % self.period self.seasonalities.append( ( - seasonal[-(dist + self.period) : -dist] + res.seasonal[-(dist + self.period) : -dist] if dist > 0 - else seasonal[-self.period :], + else res.seasonal[-self.period :], ), ) self.trends.append(res.trend) self.residuals.append(res.resid) - return self def transform( @@ -492,17 +489,19 @@ def plot_decomposition( self.seasonality = self.seasonalities[group_index] self.seasonal = self.seasonals[group_index] self.residual = self.residuals[group_index] - self.period = self.periods[group_index] - series_y = y[series_X.index].copy() + # print("Seasonality: ") + # print(self.seasonality) + # print("Trend: ") + # print(self.trend) + # print("Residual: ") + # print(self.residual) + + series_y = y[series_X.index] # will need to change later since 'freq' var needs to be mutable series_X.index = pd.DatetimeIndex(series_X["time_index"], freq="W-FRI") decomposition_results = self.get_trend_dataframe(series_X, series_y) - # print("Seasonality: ") - # print(self.seasonality[group_index]) - # print("\nTrend: ") - # print(self.trend[group_index]) fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) From 9f82dd3196877b672fc6931f2c4012d5fa18b5dd Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 25 Jul 2023 15:33:18 -0700 Subject: [PATCH 06/47] set period and freq --- .../preprocessing/stl_decomposer.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 63b5d9153d..ae54dc088f 100644 --- 
a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -190,8 +190,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: # Save the frequency of the fitted series for checking against transform data. self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) # Determine the period of the seasonal component - if self.period is None: - self.set_period(series_X, series_y) + self.set_period(series_X, series_y) stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() @@ -481,25 +480,22 @@ def plot_decomposition( # group the data by series_id grouped_X = X.groupby(self.series_index) - # iterate through each id group + + # Iterate through each series id plot_info = [] - # for series_id, series_X in grouped_X: for group_index, (series_id, series_X) in enumerate(grouped_X): self.trend = self.trends[group_index] self.seasonality = self.seasonalities[group_index] self.seasonal = self.seasonals[group_index] self.residual = self.residuals[group_index] - # print("Seasonality: ") - # print(self.seasonality) - # print("Trend: ") - # print(self.trend) - # print("Residual: ") - # print(self.residual) - series_y = y[series_X.index] + # will need to change later since 'freq' var needs to be mutable - series_X.index = pd.DatetimeIndex(series_X["time_index"], freq="W-FRI") + series_X.index = pd.DatetimeIndex( + series_X[self.time_index], + freq=self.frequency, + ) decomposition_results = self.get_trend_dataframe(series_X, series_y) From 7d1a204d6eb4176d42e00d670095785d1fe59c30 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 25 Jul 2023 18:41:56 -0700 Subject: [PATCH 07/47] modify transformer and groups --- .../preprocessing/stl_decomposer.py | 106 +++++++++++------- 1 file changed, 64 insertions(+), 42 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py 
b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index ae54dc088f..1c1f9befcf 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -167,9 +167,12 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) - # If there is not a series_index, add a new series_id column ranging from 0 to the size of the data frame - if self.series_index is None: - X.insert(0, "series_id", range(len(X))) + # If there is not a series_index, give them one series id with the value 0 + if "series_index" not in X.columns or self.series_index is None: + self.series_index = "series_index" + # X.insert(0, self.series_index, 0) + X[self.series_index] = 0 + self.update_parameters({"series_index": self.series_index}) # group the data by series_id grouped_X = X.groupby(self.series_index) @@ -180,6 +183,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: self.residuals = [] for series_id, series_X in grouped_X: + series_y = y[series_X.index] self.original_index = series_y.index if series_y is not None else None @@ -213,7 +217,7 @@ def transform( self, X: pd.DataFrame, y: pd.Series = None, - ) -> tuple[pd.DataFrame, pd.Series]: + ) -> list(tuple[pd.DataFrame, pd.Series]): """Transforms the target data by removing the STL trend and seasonality. Uses an ARIMA model to project forward the addititve trend and removes it. Then, utilizes the first period's @@ -225,48 +229,68 @@ def transform( y (pd.Series): Target variable to detrend and deseasonalize. Returns: - tuple of pd.DataFrame, pd.Series: The input features are returned without modification. The target + list of tuple of pd.DataFrame, pd.Series: The list of input features are returned without modification. The target variable y is detrended and deseasonalized. 
Raises: ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ - if y is None: - return X, y - original_index = y.index - X, y = self._check_target(X, y) - self._check_oos_past(y) - - y_in_sample = pd.Series([]) - y_out_of_sample = pd.Series([]) + # If there is not a series_index, give them one series id with the value 0 + # if self.series_index is None: + # self.series_index = "series_index" + # X.insert(0, self.series_index, 0) + # self.update_parameters({"series_index": self.series_index}) - # For partially and wholly in-sample data, retrieve stored results. - if self.trend.index[0] <= y.index[0] <= self.trend.index[-1]: - y_in_sample = self.residual[y.index[0] : y.index[-1]] + # group the data by series_id + grouped_X = X.groupby(self.series_index) - # For out of sample data.... - if y.index[-1] > self.trend.index[-1]: - try: - # ...that is partially out of sample and partially in sample. - truncated_y = y[y.index.get_loc(self.trend.index[-1]) + 1 :] - except KeyError: - # ...that is entirely out of sample. - truncated_y = y + features = [] + for group_index, (series_id, series_X) in enumerate(grouped_X): + self.trend = self.trends[group_index] + self.seasonality = self.seasonalities[group_index] + self.seasonal = self.seasonals[group_index] + self.residual = self.residuals[group_index] - ( - projected_trend, - projected_seasonality, - ) = self._project_trend_and_seasonality(truncated_y) + series_y = y[series_X.index] + if series_y is None: + return series_X, series_y + original_index = series_y.index + series_X, series_y = self._check_target(series_X, series_y) + self._check_oos_past(series_y) + + y_in_sample = pd.Series([]) + y_out_of_sample = pd.Series([]) + + # For partially and wholly in-sample data, retrieve stored results. + if self.trend.index[0] <= series_y.index[0] <= self.trend.index[-1]: + y_in_sample = self.residual[series_y.index[0] : series_y.index[-1]] + + # For out of sample data.... 
+ if series_y.index[-1] > self.trend.index[-1]: + try: + # ...that is partially out of sample and partially in sample. + truncated_y = series_y[ + series_y.index.get_loc(self.trend.index[-1]) + 1 : + ] + except KeyError: + # ...that is entirely out of sample. + truncated_y = series_y - y_out_of_sample = infer_feature_types( - pd.Series( - truncated_y - projected_trend - projected_seasonality, - index=truncated_y.index, - ), - ) - y_t = y_in_sample.append(y_out_of_sample) - y_t.index = original_index - return X, y_t + ( + projected_trend, + projected_seasonality, + ) = self._project_trend_and_seasonality(truncated_y) + + y_out_of_sample = infer_feature_types( + pd.Series( + truncated_y - projected_trend - projected_seasonality, + index=truncated_y.index, + ), + ) + y_t = y_in_sample.append(y_out_of_sample) + y_t.index = original_index + features.append((series_X, y_t)) + return features def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: """Adds back fitted trend and seasonality to target variable. @@ -470,15 +494,13 @@ def plot_decomposition( show (bool): Whether to display the plot or not. Defaults to False. 
Returns: - matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions + list[matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]]: A list of the figure and axes that have the decompositions plotted on them """ - # If there is not a series_index, add a new series_id column ranging from 0 to the size of the data frame - if self.series_index is None: - X.insert(0, "series_id", range(len(X))) + # If there is not a series_index, give them one series id with the value 0 - # group the data by series_id + # Group the data by series_id grouped_X = X.groupby(self.series_index) # Iterate through each series id From d87f0070781f4bfacad6785179b72b82a412e7c2 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 26 Jul 2023 18:07:52 -0700 Subject: [PATCH 08/47] use dictionary instead of list --- .../preprocessing/stl_decomposer.py | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 1c1f9befcf..ee8ec86c95 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -42,7 +42,7 @@ class STLDecomposer(Decomposer): def __init__( self, time_index: str = None, - series_index: str = None, + series_id: str = None, degree: int = 1, # Currently unused. period: int = None, seasonal_smoother: int = 7, @@ -50,7 +50,7 @@ def __init__( **kwargs, ): self.logger = logging.getLogger(__name__) - self.series_index = series_index + self.series_id = series_id # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. 
if seasonal_smoother % 2 == 0: @@ -168,22 +168,16 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: ) # If there is not a series_index, give them one series id with the value 0 - if "series_index" not in X.columns or self.series_index is None: - self.series_index = "series_index" - # X.insert(0, self.series_index, 0) - X[self.series_index] = 0 - self.update_parameters({"series_index": self.series_index}) + if self.series_id is None: + self.series_id = "series_id" + X[self.series_id] = 0 + self.update_parameters({"series_id": self.series_id}) # group the data by series_id - grouped_X = X.groupby(self.series_index) + grouped_X = X.groupby(self.series_id) # iterate through each id group - self.seasonals = [] - self.seasonalities = [] - self.trends = [] - self.residuals = [] - - for series_id, series_X in grouped_X: - + self.decompositions = {} + for id, series_X in grouped_X: series_y = y[series_X.index] self.original_index = series_y.index if series_y is not None else None @@ -198,19 +192,26 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() - self.seasonals.append(res.seasonal) - self.period = stl.period + + period = stl.period dist = len(series_y) % self.period - self.seasonalities.append( + seasonality = ( ( - res.seasonal[-(dist + self.period) : -dist] + res.seasonal[-(dist + period) : -dist] if dist > 0 - else res.seasonal[-self.period :], + else res.seasonal[-period:], ), ) - self.trends.append(res.trend) - self.residuals.append(res.resid) + + self.decompositions[id] = { + "seasonal": res.seasonal, + "seasonality": seasonality, + "trend": res.trend, + "residual": res.resid, + "period": period, + } + return self def transform( @@ -245,11 +246,12 @@ def transform( grouped_X = X.groupby(self.series_index) features = [] - for group_index, (series_id, series_X) in enumerate(grouped_X): - self.trend = self.trends[group_index] - 
self.seasonality = self.seasonalities[group_index] - self.seasonal = self.seasonals[group_index] - self.residual = self.residuals[group_index] + for id, series_X in grouped_X: + self.trend = self.decompositions[id]["trend"] + self.seasonality = self.decompositions[id]["seasonality"] + self.seasonal = self.decompositions[id]["seasonal"] + self.residual = self.decompositions[id]["residual"] + self.period = self.decompositions[id]["period"] series_y = y[series_X.index] if series_y is None: @@ -494,22 +496,21 @@ def plot_decomposition( show (bool): Whether to display the plot or not. Defaults to False. Returns: - list[matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]]: A list of the figure and axes that have the decompositions + matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions plotted on them """ - # If there is not a series_index, give them one series id with the value 0 - - # Group the data by series_id - grouped_X = X.groupby(self.series_index) + # group the data by series_id + grouped_X = X.groupby(self.series_id) # Iterate through each series id plot_info = [] - for group_index, (series_id, series_X) in enumerate(grouped_X): - self.trend = self.trends[group_index] - self.seasonality = self.seasonalities[group_index] - self.seasonal = self.seasonals[group_index] - self.residual = self.residuals[group_index] + for id, series_X in grouped_X: + self.trend = self.decompositions[id]["trend"] + self.seasonality = self.decompositions[id]["seasonality"] + self.seasonal = self.decompositions[id]["seasonal"] + self.residual = self.decompositions[id]["residual"] + self.period = self.decompositions[id]["period"] series_y = y[series_X.index] @@ -532,7 +533,7 @@ def plot_decomposition( axs[3].plot(decomposition_results[0]["residual"], "y") axs[3].set_title("residual") - fig.suptitle("Decomposition for Series {}".format(series_id)) + fig.suptitle("Decomposition for Series {}".format(id)) plot_info.append((fig, axs)) 
From c43b8609d5ef83636f1a583a4a6a70802fbbf84a Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 28 Jul 2023 13:59:38 -0700 Subject: [PATCH 09/47] pass components test and fix ww --- .../preprocessing/stl_decomposer.py | 67 ++++++++++++------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index ee8ec86c95..0afbe3efd5 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -21,7 +21,7 @@ class STLDecomposer(Decomposer): Args: time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None. - series_index(str): Specifies the name of the column in X that provides the series_id objects for multiseries. Defaults to None. + series_id (str): Specifies the name of the column in X that provides the series_id objects for multiseries. Defaults to None. degree (int): Not currently used. STL 3x "degree-like" values. None are able to be set at this time. Defaults to 1. period (int): The number of entries in the time series data that corresponds to one period of a @@ -47,10 +47,12 @@ def __init__( period: int = None, seasonal_smoother: int = 7, random_seed: int = 0, + is_multiseries: bool = False, **kwargs, ): self.logger = logging.getLogger(__name__) self.series_id = series_id + self.is_multiseries = is_multiseries # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. 
if seasonal_smoother % 2 == 0: @@ -61,7 +63,10 @@ def __init__( seasonal_smoother += 1 self.forecast_summary = None - + # parameters = { + # "series_id": series_id, + # } + # parameters.update(kwargs) super().__init__( component_obj=None, random_seed=random_seed, @@ -167,15 +172,20 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) - # If there is not a series_index, give them one series id with the value 0 + # If there is not a series_id, give them one series id with the value 0 if self.series_id is None: self.series_id = "series_id" X[self.series_id] = 0 self.update_parameters({"series_id": self.series_id}) + else: + self.is_multiseries = True - # group the data by series_id + # Initialize the new "series_id" column in Woodwork + X.ww.init() + + # Group the data by series_id grouped_X = X.groupby(self.series_id) - # iterate through each id group + # Iterate through each id group self.decompositions = {} for id, series_X in grouped_X: series_y = y[series_X.index] @@ -195,7 +205,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: period = stl.period - dist = len(series_y) % self.period + dist = len(series_y) % period seasonality = ( ( res.seasonal[-(dist + period) : -dist] @@ -218,7 +228,7 @@ def transform( self, X: pd.DataFrame, y: pd.Series = None, - ) -> list(tuple[pd.DataFrame, pd.Series]): + ): """Transforms the target data by removing the STL trend and seasonality. Uses an ARIMA model to project forward the addititve trend and removes it. Then, utilizes the first period's @@ -230,22 +240,18 @@ def transform( y (pd.Series): Target variable to detrend and deseasonalize. Returns: - list of tuple of pd.DataFrame, pd.Series: The list of input features are returned without modification. The target + (Single series) pd.DataFrame, pd.Series: The list of input features are returned without modification. 
The target + variable y is detrended and deseasonalized. + (Multi series) pd.DataFrame, pd.Series: The list of input features are returned without modification. The target variable y is detrended and deseasonalized. Raises: ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ - # If there is not a series_index, give them one series id with the value 0 - # if self.series_index is None: - # self.series_index = "series_index" - # X.insert(0, self.series_index, 0) - # self.update_parameters({"series_index": self.series_index}) - # group the data by series_id - grouped_X = X.groupby(self.series_index) + grouped_X = X.groupby(self.series_id) - features = [] + features = {} for id, series_X in grouped_X: self.trend = self.decompositions[id]["trend"] self.seasonality = self.decompositions[id]["seasonality"] @@ -254,6 +260,7 @@ def transform( self.period = self.decompositions[id]["period"] series_y = y[series_X.index] + if series_y is None: return series_X, series_y original_index = series_y.index @@ -291,7 +298,12 @@ def transform( ) y_t = y_in_sample.append(y_out_of_sample) y_t.index = original_index - features.append((series_X, y_t)) + + if not self.is_multiseries: + return series_X, y_t + + features[id] = (series_X, y_t) + return features def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: @@ -486,7 +498,7 @@ def plot_decomposition( X: pd.DataFrame, y: pd.Series, show: bool = False, - ) -> list[tuple[plt.Figure, list]]: + ): """Plots the decomposition of the target signal. Args: @@ -496,15 +508,18 @@ def plot_decomposition( show (bool): Whether to display the plot or not. Defaults to False. 
Returns: - matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions + (Single series) matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions plotted on them + (Multi series) dict[matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]]: A dictionary that maps the series id to + the figure and axes that have the decompositions plotted on them + """ - # group the data by series_id + # Group the data by series_id grouped_X = X.groupby(self.series_id) # Iterate through each series id - plot_info = [] + plot_info = {} for id, series_X in grouped_X: self.trend = self.decompositions[id]["trend"] self.seasonality = self.decompositions[id]["seasonality"] @@ -514,7 +529,6 @@ def plot_decomposition( series_y = y[series_X.index] - # will need to change later since 'freq' var needs to be mutable series_X.index = pd.DatetimeIndex( series_X[self.time_index], freq=self.frequency, @@ -533,10 +547,13 @@ def plot_decomposition( axs[3].plot(decomposition_results[0]["residual"], "y") axs[3].set_title("residual") - fig.suptitle("Decomposition for Series {}".format(id)) - - plot_info.append((fig, axs)) + if self.is_multiseries: + fig.suptitle("Decomposition for Series {}".format(id)) + plot_info[id] = (fig, axs) + else: + plot_info = (fig, axs) if show: # pragma: no cover plt.show() + return plot_info From 3025b20204ecd03a702506fd13f39937db1b031b Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 28 Jul 2023 17:26:28 -0700 Subject: [PATCH 10/47] check if multiseris variable --- .../preprocessing/stl_decomposer.py | 103 +++++++++++------- 1 file changed, 64 insertions(+), 39 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 0afbe3efd5..c6ff749f5f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ 
b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -63,10 +63,6 @@ def __init__( seasonal_smoother += 1 self.forecast_summary = None - # parameters = { - # "series_id": series_id, - # } - # parameters.update(kwargs) super().__init__( component_obj=None, random_seed=random_seed, @@ -188,7 +184,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: # Iterate through each id group self.decompositions = {} for id, series_X in grouped_X: - series_y = y[series_X.index] + + if y is None: + series_y = None + elif isinstance(series_X.index, pd.DatetimeIndex): + series_y = y[(series_X.reset_index(drop=True).index)] + else: + series_y = y[series_X.index] self.original_index = series_y.index if series_y is not None else None series_X, series_y = self._check_target(series_X, series_y) @@ -203,24 +205,29 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() - period = stl.period + self.period = stl.period - dist = len(series_y) % period - seasonality = ( + dist = len(series_y) % self.period + self.seasonal = res.seasonal + self.seasonality = ( ( - res.seasonal[-(dist + period) : -dist] + res.seasonal[-(dist + self.period) : -dist] if dist > 0 - else res.seasonal[-period:], + else res.seasonal[-self.period :], ), ) - self.decompositions[id] = { - "seasonal": res.seasonal, - "seasonality": seasonality, - "trend": res.trend, - "residual": res.resid, - "period": period, - } + self.trend = res.trend + self.residual = res.resid + + if self.is_multiseries: + self.decompositions[id] = { + "seasonal": self.seasonal, + "seasonality": self.seasonality, + "trend": self.trend, + "residual": self.resid, + "period": self.period, + } return self @@ -242,24 +249,31 @@ def transform( Returns: (Single series) pd.DataFrame, pd.Series: The list of input features are returned without modification. 
The target variable y is detrended and deseasonalized. - (Multi series) pd.DataFrame, pd.Series: The list of input features are returned without modification. The target + (Multi series) pd.DataFrame, pd.DataFrame: The list of input features are returned without modification. The target variable y is detrended and deseasonalized. Raises: ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ + if y is None: + return X, y # group the data by series_id grouped_X = X.groupby(self.series_id) - features = {} + features_list = [] + detrending_list = [] for id, series_X in grouped_X: - self.trend = self.decompositions[id]["trend"] - self.seasonality = self.decompositions[id]["seasonality"] - self.seasonal = self.decompositions[id]["seasonal"] - self.residual = self.decompositions[id]["residual"] - self.period = self.decompositions[id]["period"] - - series_y = y[series_X.index] + if self.is_multiseries: + self.trend = self.decompositions[id]["trend"] + self.seasonality = self.decompositions[id]["seasonality"] + self.seasonal = self.decompositions[id]["seasonal"] + self.residual = self.decompositions[id]["residual"] + self.period = self.decompositions[id]["period"] + + if isinstance(series_X.index, pd.DatetimeIndex): + series_y = y[(series_X.reset_index(drop=True).index)] + else: + series_y = y[series_X.index] if series_y is None: return series_X, series_y @@ -299,12 +313,18 @@ def transform( y_t = y_in_sample.append(y_out_of_sample) y_t.index = original_index + # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] if not self.is_multiseries: return series_X, y_t - features[id] = (series_X, y_t) + features_list.append({id: series_X}) + detrending_list.append({id: y_t}) - return features + # Convert the list to a DataFrame + # For multiseries, return tuple[pd.DataFrame, pd.Dataframe] where each column is a series_id + features_df = pd.DataFrame(features_list) + detrending_df = pd.DataFrame(detrending_list) 
+ return features_df, detrending_df def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: """Adds back fitted trend and seasonality to target variable. @@ -521,18 +541,23 @@ def plot_decomposition( # Iterate through each series id plot_info = {} for id, series_X in grouped_X: - self.trend = self.decompositions[id]["trend"] - self.seasonality = self.decompositions[id]["seasonality"] - self.seasonal = self.decompositions[id]["seasonal"] - self.residual = self.decompositions[id]["residual"] - self.period = self.decompositions[id]["period"] - - series_y = y[series_X.index] + if self.is_multiseries: + self.trend = self.decompositions[id]["trend"] + self.seasonality = self.decompositions[id]["seasonality"] + self.seasonal = self.decompositions[id]["seasonal"] + self.residual = self.decompositions[id]["residual"] + self.period = self.decompositions[id]["period"] + + if isinstance(series_X.index, pd.DatetimeIndex): + series_y = y[(series_X.reset_index(drop=True).index)] + else: + series_y = y[series_X.index] - series_X.index = pd.DatetimeIndex( - series_X[self.time_index], - freq=self.frequency, - ) + if self.is_multiseries: + series_X.index = pd.DatetimeIndex( + series_X[self.time_index], + freq=self.frequency, + ) decomposition_results = self.get_trend_dataframe(series_X, series_y) From c7f1eddc2dd064d5d5effb18a35af945f7558e0a Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 31 Jul 2023 15:28:05 -0700 Subject: [PATCH 11/47] extend stldecomposer for multiseries --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 652f8eac74..d7014da76a 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,6 +3,7 @@ Release Notes **Future Releases** * Enhancements * Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233` + * Extend STLDecomposer to Support Multiseries :pr:`4253` * 
Fixes * Changes * Unpinned sktime version :pr:`4214` From be2dd2dd154455c9e84365fabb7ac1c85853c476 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 31 Jul 2023 15:28:47 -0700 Subject: [PATCH 12/47] add null checks --- .../preprocessing/stl_decomposer.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index c6ff749f5f..f451568e61 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -172,7 +172,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: if self.series_id is None: self.series_id = "series_id" X[self.series_id] = 0 - self.update_parameters({"series_id": self.series_id}) else: self.is_multiseries = True @@ -204,17 +203,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() - + self.seasonal = res.seasonal self.period = stl.period - dist = len(series_y) % self.period - self.seasonal = res.seasonal self.seasonality = ( - ( - res.seasonal[-(dist + self.period) : -dist] - if dist > 0 - else res.seasonal[-self.period :], - ), + self.seasonal[-(dist + self.period) : -dist] + if dist > 0 + else self.seasonal[-self.period :] ) self.trend = res.trend @@ -257,23 +252,28 @@ def transform( """ if y is None: return X, y - # group the data by series_id - grouped_X = X.groupby(self.series_id) + + if not self.is_multiseries and X is not None: + self.series_id = "series_id" + X[self.series_id] = 0 + # If X is None, create a series with id=0 and series_X=None + grouped_X = {0: X}.items() if X is None else X.groupby(self.series_id) features_list = [] detrending_list = [] for id, series_X in grouped_X: if self.is_multiseries: - self.trend = 
self.decompositions[id]["trend"] self.seasonality = self.decompositions[id]["seasonality"] + self.trend = self.decompositions[id]["trend"] self.seasonal = self.decompositions[id]["seasonal"] self.residual = self.decompositions[id]["residual"] self.period = self.decompositions[id]["period"] - - if isinstance(series_X.index, pd.DatetimeIndex): - series_y = y[(series_X.reset_index(drop=True).index)] + if isinstance(series_X.index, pd.DatetimeIndex): + series_y = y[(series_X.reset_index(drop=True).index)] + else: + series_y = y[series_X.index] else: - series_y = y[series_X.index] + series_y = y if series_y is None: return series_X, series_y @@ -542,9 +542,9 @@ def plot_decomposition( plot_info = {} for id, series_X in grouped_X: if self.is_multiseries: - self.trend = self.decompositions[id]["trend"] self.seasonality = self.decompositions[id]["seasonality"] self.seasonal = self.decompositions[id]["seasonal"] + self.trend = self.decompositions[id]["trend"] self.residual = self.decompositions[id]["residual"] self.period = self.decompositions[id]["period"] From c2b60ac8af8d343a1e03dfc92307094af74a8ca7 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 26 Jul 2023 15:59:41 -0400 Subject: [PATCH 13/47] Add stacking and unstacking utils for multiseries (#4250) * Add unstacking function * Add stacking function * Add tests for both functions --- docs/source/release_notes.rst | 1 + evalml/pipelines/utils.py | 106 ++++++++++++++++++ evalml/tests/conftest.py | 35 ++++++ .../pipeline_tests/test_pipeline_utils.py | 89 +++++++++++++++ 4 files changed, 231 insertions(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d7014da76a..cfe0c4b1e7 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,6 +3,7 @@ Release Notes **Future Releases** * Enhancements * Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233` + * Added stacking and unstacking utility functions to work 
with multiseries data :pr:`4250` * Extend STLDecomposer to Support Multiseries :pr:`4253` * Fixes * Changes diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index daccbf937c..c95a3bbc51 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -5,6 +5,7 @@ import black import featuretools as ft +import pandas as pd from woodwork import logical_types from evalml.data_checks import DataCheckActionCode, DataCheckActionOption @@ -1348,3 +1349,108 @@ def rows_of_interest( preds_value_proba = preds_value_proba[preds_value_proba <= epsilon] return preds_value_proba.index.tolist() + + +def unstack_multiseries( + X, + y, + series_id, + time_index, + target_name, + keep_time_in_index=True, +): + """Converts multiseries data with one series_id column and one target column to one target column per series id. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + y (pd.Series): Target data. + series_id (str): The column which identifies which series each row belongs to. + time_index (str): Specifies the name of the column in X that provides the datetime objects. + target_name (str): The name of the target column. + keep_time_in_index (bool): Whether to maintain the time index as the index of the returned dataframes. Defaults to True. + If set to false, will discard the time index information entirely. + + Returns: + pd.DataFrame, pd.DataFrame: The unstacked X and y data. 
+ """ + # Combine X and y to make it easier to unstack + full_dataset = pd.concat([X, y.set_axis(X.index)], axis=1) + + # Get the total number of series, with their names + series_id_unique = full_dataset[series_id].unique() + + # Perform the unstacking + X_unstacked_cols = [] + y_unstacked_cols = [] + for s_id in series_id_unique: + single_series = full_dataset[full_dataset[series_id] == s_id] + + # Save the time_index for alignment + new_time_index = single_series[time_index] + for column_name in full_dataset.columns.drop([time_index, series_id]): + new_column = single_series[column_name] + new_column.index = new_time_index + new_column.name = f"{column_name}_{s_id}" + + if column_name == target_name: + y_unstacked_cols.append(new_column) + else: + X_unstacked_cols.append(new_column) + + # Concatenate all the single series to reform dataframes + X_unstacked = pd.concat(X_unstacked_cols, axis=1) + y_unstacked = pd.concat(y_unstacked_cols, axis=1) + + # Reset the axis if need be + if not keep_time_in_index: + X_unstacked.reset_index(drop=True, inplace=True) + y_unstacked.reset_index(drop=True, inplace=True) + + return X_unstacked, y_unstacked + + +def stack_data(data, include_series_id=False, series_id_name=None): + """Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True. + + Should only be used for data that is expected to be a single series. To stack multiple unstacked columns, + call this function multiple times on the desired subsets. + + Args: + data (pd.DataFrame): The data to stack. + include_series_id (bool): Whether or not to extract the series id and include it in a separate columns + series_id_name (str): If include_series_id is True, the series_id name to set for the column. The column + will be named 'series_id' if this parameter is None. + + Returns: + pd.Series or pd.DataFrame: The data in stacked series form. 
+ """ + if data is None or isinstance(data, pd.Series): + return data + + stacked_series = data.stack(0) + + # Extract the original column name + series_id_with_name = stacked_series.index.droplevel() + stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1]) + + # If the index is the time index, keep it + if not data.index.is_numeric(): + new_time_index = data.index.unique().repeat(len(data.columns)) + # Otherwise, set it to unique integers + else: + new_time_index = pd.RangeIndex( + start=data.index[0], + stop=data.index[0] + len(stacked_series), + ) + stacked_series = stacked_series.set_axis(new_time_index) + + # Pull out the series id information, if requested + if include_series_id: + series_id_col = pd.Series( + series_id_with_name.map(lambda col_name: col_name.split("_")[-1]), + name=series_id_name or "series_id", + index=stacked_series.index, + ) + stacked_series = pd.concat([series_id_col, stacked_series], axis=1) + + return stacked_series diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 608ccda275..2d508bd53d 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1007,6 +1007,41 @@ def ts_data_seasonal_test(): return X, y +@pytest.fixture +def multiseries_ts_data_stacked(): + time_index = pd.date_range(start="1/1/2018", periods=20).repeat(5) + series_id = list(range(5)) * 20 + + X = pd.DataFrame( + { + "date": time_index, + "series_id": series_id, + "feature_a": range(100), + "feature_b": reversed(range(100)), + }, + ) + y = pd.Series(range(100)) + return X, y + + +@pytest.fixture +def multiseries_ts_data_unstacked(): + feature_a = pd.DataFrame({f"feature_a_{i}": range(i, 100, 5) for i in range(5)}) + feature_b = pd.DataFrame( + {f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)}, + ) + X = pd.concat([feature_a, feature_b], axis=1) + + y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) + + X.index = pd.date_range(start="1/1/2018", periods=20) + X.index.name = "date" + y.index 
= pd.date_range(start="1/1/2018", periods=20) + y.index.name = "date" + + return X, y + + @pytest.fixture def dummy_pipeline_hyperparameters(): return { diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 1e4313605a..d1b74d5283 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -54,6 +54,8 @@ make_pipeline, make_pipeline_from_actions, rows_of_interest, + stack_data, + unstack_multiseries, ) from evalml.problem_types import ProblemTypes, is_time_series @@ -1374,3 +1376,90 @@ def test_make_pipeline_features_and_dfs(X_y_binary): ) assert "DFS Transformer" == pipeline.component_graph.compute_order[0] + + +@pytest.mark.parametrize("target_name", ["target", "Target_Data"]) +@pytest.mark.parametrize("keep_time_in_index", [True, False]) +def test_unstack_multiseries( + target_name, + keep_time_in_index, + multiseries_ts_data_stacked, + multiseries_ts_data_unstacked, +): + X, y = multiseries_ts_data_stacked + X_unstacked, y_unstacked = multiseries_ts_data_unstacked + y.name = target_name + y_unstacked.columns = [ + f"{target_name}_{i}" for i in range(len(y_unstacked.columns)) + ] + if not keep_time_in_index: + X_unstacked.reset_index(drop=True, inplace=True) + y_unstacked.reset_index(drop=True, inplace=True) + + X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries( + X, + y, + "series_id", + "date", + target_name=target_name, + keep_time_in_index=keep_time_in_index, + ) + pd.testing.assert_frame_equal( + X_unstacked.sort_index(axis=1), + X_unstacked_transformed.sort_index(axis=1), + check_freq=False, + ) + pd.testing.assert_frame_equal( + y_unstacked, + y_unstacked_transformed, + check_freq=False, + ) + + +@pytest.mark.parametrize("include_series_id", [True, False]) +@pytest.mark.parametrize("series_id_name", [None, "SERIES"]) +@pytest.mark.parametrize("index_type", ["datetime", "int"]) +def test_stack_data( + 
include_series_id, + series_id_name, + index_type, + multiseries_ts_data_stacked, + multiseries_ts_data_unstacked, +): + _, y = multiseries_ts_data_unstacked + _, y_stacked = multiseries_ts_data_stacked + + y_stacked.name = "target" + + if index_type == "datetime": + y_stacked.index = pd.date_range(start="1/1/2018", periods=20).repeat(5) + y_stacked.index.name = "date" + else: + y = y.reset_index(drop=True) + + y_stacked_transformed = stack_data( + y, + include_series_id=include_series_id, + series_id_name=series_id_name, + ) + + if include_series_id: + series_id_name = series_id_name or "series_id" + series_id_col = pd.Series( + list(range(5)) * 20, + dtype="str", + index=y_stacked.index, + ) + y_stacked = pd.DataFrame({series_id_name: series_id_col, "target": y_stacked}) + pd.testing.assert_frame_equal(y_stacked, y_stacked_transformed) + + else: + pd.testing.assert_series_equal(y_stacked, y_stacked_transformed) + + +def test_stack_data_noop(): + none_y = None + series_y = pd.Series(range(100)) + + assert stack_data(none_y) is None + pd.testing.assert_series_equal(stack_data(series_y), series_y) From 781c139da6acc8a440e5f5121175dca190797668 Mon Sep 17 00:00:00 2001 From: Christopher Bunn Date: Thu, 27 Jul 2023 13:43:49 -0400 Subject: [PATCH 14/47] Add support for pandas 2 (#4216) * Squashed changes * Ignored index * Disabled column checking * Reverted deleted code * Updated pyproject.toml * Replaced version check code --- .github/meta.yaml | 2 +- core-requirements.txt | 2 +- docs/source/release_notes.rst | 1 + docs/source/user_guide/timeseries.ipynb | 4 +-- evalml/model_understanding/visualizations.py | 4 +-- .../transformers/encoders/onehot_encoder.py | 2 +- .../transformers/preprocessing/decomposer.py | 4 ++- .../preprocessing/polynomial_decomposer.py | 2 +- .../preprocessing/stl_decomposer.py | 2 +- evalml/preprocessing/utils.py | 6 ++-- .../decomposer_tests/test_decomposer.py | 18 +++++++++-- .../decomposer_tests/test_stl_decomposer.py | 9 +++++- 
.../test_datetime_featurizer.py | 11 ++++--- evalml/tests/component_tests/test_imputer.py | 32 ++++++++++++++++--- .../component_tests/test_lgbm_classifier.py | 18 +++++++++-- .../component_tests/test_lgbm_regressor.py | 9 +++++- .../component_tests/test_one_hot_encoder.py | 6 ++-- .../tests/component_tests/test_oversampler.py | 1 + .../component_tests/test_simple_imputer.py | 4 +-- .../component_tests/test_target_encoder.py | 2 +- .../test_time_series_imputer.py | 23 ++++++++++--- .../test_time_series_regularizer.py | 7 ++-- .../component_tests/test_undersampler.py | 3 +- .../test_class_imbalance_data_check.py | 8 ++--- .../test_datetime_format_data_check.py | 12 +++---- .../latest_dependency_versions.txt | 2 +- .../test_visualizations.py | 7 ++-- pyproject.toml | 2 +- 28 files changed, 146 insertions(+), 57 deletions(-) diff --git a/.github/meta.yaml b/.github/meta.yaml index b0a7ddce84..c977e102ed 100644 --- a/.github/meta.yaml +++ b/.github/meta.yaml @@ -25,7 +25,7 @@ outputs: - setuptools ==58.0.4 run: - numpy >=1.21.0 - - pandas >=1.5.0, <2.0.0 + - pandas >=1.5.0 - dask >=2022.2.0, !=2022.10.1 - scipy >=1.5.0 - scikit-learn >=1.2.2 diff --git a/core-requirements.txt b/core-requirements.txt index a719ddb846..09beb0d636 100644 --- a/core-requirements.txt +++ b/core-requirements.txt @@ -1,5 +1,5 @@ numpy>=1.21.0 -pandas>=1.5.0, <2.0.0 +pandas>=1.5.0 scipy>=1.5.0 scikit-learn>=1.2.1 scikit-optimize>=0.9.0 diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index cfe0c4b1e7..64255c1efe 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -6,6 +6,7 @@ Release Notes * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250` * Extend STLDecomposer to Support Multiseries :pr:`4253` * Fixes + * Added support for pandas 2 :pr:`4216` * Changes * Unpinned sktime version :pr:`4214` * Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237` diff --git 
a/docs/source/user_guide/timeseries.ipynb b/docs/source/user_guide/timeseries.ipynb index fbd5e7df07..cec0f391b8 100644 --- a/docs/source/user_guide/timeseries.ipynb +++ b/docs/source/user_guide/timeseries.ipynb @@ -996,8 +996,8 @@ " ),\n", " # Plot prediction intervals\n", " go.Scatter(\n", - " x=X_forecast_dates[\"Date\"].append(X_forecast_dates[\"Date\"][::-1]),\n", - " y=y_upper.append(y_lower[::-1]),\n", + " x=pd.concat([X_forecast_dates[\"Date\"], X_forecast_dates[\"Date\"][::-1]]),\n", + " y=pd.concat([y_upper, y_lower[::-1]]),\n", " fill=\"toself\",\n", " fillcolor=\"rgba(255,0,0,0.2)\",\n", " line=dict(color=\"rgba(255,0,0,0.2)\"),\n", diff --git a/evalml/model_understanding/visualizations.py b/evalml/model_understanding/visualizations.py index e50812f383..6eb74b58d9 100644 --- a/evalml/model_understanding/visualizations.py +++ b/evalml/model_understanding/visualizations.py @@ -472,8 +472,8 @@ def get_linear_coefficients(estimator, features=None): coef_.name = "Coefficients" coef_.index = features coef_ = coef_.sort_values() - coef_ = pd.Series(estimator._component_obj.intercept_, index=["Intercept"]).append( - coef_, + coef_ = pd.concat( + [pd.Series(estimator._component_obj.intercept_, index=["Intercept"]), coef_], ) return coef_ diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 9b108203d0..434c4081d6 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -152,7 +152,7 @@ def fit(self, X, y=None): random_state=self._initial_state, ) value_counts = value_counts.sort_values( - [col], + value_counts.iloc[:, 0].name, ascending=False, kind="mergesort", ) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index abd9543f97..3f3d0e0718 100644 --- 
a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -151,7 +151,9 @@ def determine_periodicity( period is detected, returns None. """ - X, y = cls._handle_nullable_types(cls, X, y) + # Only need to handle nullable types on pandas < 2. Kept for backwards compatibility with pandas 1.x. + if int(pd.__version__.split(".")[0]) < 2: + X, y = cls._handle_nullable_types(cls, X, y) def _get_rel_max_from_acf(y): """Determines the relative maxima of the target's autocorrelation.""" diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py index 5a0db87aa4..f2ffb93366 100644 --- a/evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py @@ -267,7 +267,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: index=truncated_y_t.index, ), ) - y = y_in_sample.append(y_out_of_sample) + y = pd.concat([y_in_sample, y_out_of_sample]) y.index = original_index return y diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index f451568e61..294f3a3c7f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -396,7 +396,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: index=truncated_y_t.index, ), ) - y = y_in_sample.append(y_out_of_sample) + y = pd.concat([y_in_sample, y_out_of_sample]) y.index = original_index return y diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index a5022afa12..447db93aee 100644 --- a/evalml/preprocessing/utils.py +++ 
b/evalml/preprocessing/utils.py @@ -184,19 +184,17 @@ def target_distribution(targets): Examples: >>> y = pd.Series([1, 2, 4, 1, 3, 3, 1, 2]) - >>> target_distribution(y) + >>> print(target_distribution(y).to_string()) Targets 1 37.50% 2 25.00% 3 25.00% 4 12.50% - dtype: object >>> y = pd.Series([True, False, False, False, True]) - >>> target_distribution(y) + >>> print(target_distribution(y).to_string()) Targets False 60.00% True 40.00% - dtype: object """ distribution = targets.value_counts() / len(targets) return distribution.mul(100).apply("{:.2f}%".format).rename_axis("Targets") diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index d5c1cfbf03..aaf924f626 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -292,9 +292,8 @@ def test_decomposer_build_seasonal_signal( X, _, y = ts_data() # Change the date time index to start at the same time but have different frequency - y.set_axis( + y = y.set_axis( pd.date_range(start="2021-01-01", periods=len(y), freq=frequency), - inplace=True, ) decomposer = decomposer_child_class(degree=2) @@ -497,7 +496,12 @@ def test_decomposer_determine_periodicity( True, pytest.param( False, - marks=pytest.mark.xfail(strict=True, raises=AssertionError), + marks=pytest.mark.xfail( + condition=int(pd.__version__.split(".")[0]) < 2, + strict=True, + raises=AssertionError, + reason="pandas 1.x does not recognize np.Nan in Float64 subtracted_floats.", + ), ), ], ) @@ -749,12 +753,20 @@ def test_decomposer_inverse_transform( output_inverse_y = decomposer.inverse_transform(y_t_new) else: output_inverse_y = decomposer.inverse_transform(y_t_new) + # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, + # we need to test the indices equivalence separately. 
pd.testing.assert_series_equal( y[y_t_new.index], output_inverse_y, check_exact=False, + check_index=False, rtol=1.0e-1, ) + pd.testing.assert_index_equal( + y[y_t_new.index].index, + output_inverse_y.index, + exact=False, + ) @pytest.mark.parametrize( diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 84b1635b58..afedf2c686 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -181,13 +181,20 @@ def test_stl_decomposer_inverse_transform( ): output_inverse_y = decomposer.inverse_transform(y_t_new) else: + # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, + # we need to test the indices equivalence separately. output_inverse_y = decomposer.inverse_transform(y_t_new) pd.testing.assert_series_equal( y[y_t_new.index], output_inverse_y, - check_exact=False, + check_index=False, rtol=1.0e-2, ) + pd.testing.assert_index_equal( + y[y_t_new.index].index, + output_inverse_y.index, + exact=False, + ) @pytest.mark.parametrize( diff --git a/evalml/tests/component_tests/test_datetime_featurizer.py b/evalml/tests/component_tests/test_datetime_featurizer.py index d3755dadf4..d853a1181b 100644 --- a/evalml/tests/component_tests/test_datetime_featurizer.py +++ b/evalml/tests/component_tests/test_datetime_featurizer.py @@ -77,10 +77,10 @@ def test_datetime_featurizer_encodes_as_ints(): # Test that changing encode_as_categories to True only changes the dtypes but not the values dt_with_cats = DateTimeFeaturizer(encode_as_categories=True) X_transformed_df = dt_with_cats.fit_transform(X) - expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0]) - expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5]) + expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0]).astype("category") + expected["date_day_of_week"] = pd.Categorical([0, 
3, 2, 1, 5]).astype("category") - assert_frame_equal(expected, X_transformed_df) + assert_frame_equal(expected, X_transformed_df, check_categorical=False) assert dt_with_cats.get_feature_names() == feature_names # Test that sequential calls to the same DateTimeFeaturizer work as expected by using the first dt we defined @@ -250,7 +250,10 @@ def test_datetime_featurizer_no_datetime_cols(): def test_datetime_featurizer_numpy_array_input(): datetime_transformer = DateTimeFeaturizer() - X = np.array([["2007-02-03"], ["2016-06-07"], ["2020-05-19"]], dtype="datetime64") + X = np.array( + [["2007-02-03"], ["2016-06-07"], ["2020-05-19"]], + dtype="datetime64[ns]", + ) datetime_transformer.fit(X) assert list(datetime_transformer.transform(X).columns) == [ "0_year", diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index 8cd3bc6a91..4e39ab1bcc 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -219,11 +219,21 @@ def test_drop_all_columns(imputer_test_data): imputer.fit(X, y) transformed = imputer.transform(X, y) expected = X.drop(["all nan cat", "all nan"], axis=1) - assert_frame_equal(transformed, expected, check_dtype=False) + assert_frame_equal( + transformed, + expected, + check_column_type=False, + check_index_type=False, + ) imputer = Imputer() transformed = imputer.fit_transform(X, y) - assert_frame_equal(transformed, expected, check_dtype=False) + assert_frame_equal( + transformed, + expected, + check_column_type=False, + check_index_type=False, + ) def test_typed_imputer_numpy_input(): @@ -271,11 +281,21 @@ def test_imputer_empty_data(data_type, make_data_type): imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) - assert_frame_equal(transformed, expected, check_dtype=False) + assert_frame_equal( + transformed, + expected, + check_column_type=False, + check_index_type=False, + ) imputer = Imputer() transformed = 
imputer.fit_transform(X, y) - assert_frame_equal(transformed, expected, check_dtype=False) + assert_frame_equal( + transformed, + expected, + check_column_type=False, + check_index_type=False, + ) def test_imputer_does_not_reset_index(): @@ -508,7 +528,9 @@ def test_imputer_with_none_separated( for col in set(columns_dict["categoricals_only"]).intersection( set(X_test.columns), ): - expected_df[col].cat.add_categories(categorical_fill_value, inplace=True) + expected_df[col] = expected_df[col].cat.add_categories( + categorical_fill_value, + ) expected_df[col].iloc[-1:] = categorical_fill_value if boolean_impute_strategy == "constant": for col in set(columns_dict["booleans_only"]).intersection(set(X_test.columns)): diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index 81a6c0546b..4b7e99c238 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -164,11 +164,25 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): clf.predict(X) arg_X = mock_predict.call_args[0][0] - assert_frame_equal(X_expected, arg_X) + # Index type checking ignored so the test can pass on Windows + # X_expected is int32, arg_X is int64 + assert_frame_equal( + X_expected, + arg_X, + check_index_type=False, + check_column_type=False, + ) clf.predict_proba(X) arg_X = mock_predict_proba.call_args[0][0] - assert_frame_equal(X_expected, arg_X) + # Index type checking ignored so the test can pass on Windows + # X_expected is int32, arg_X is int64 + assert_frame_equal( + X_expected, + arg_X, + check_index_type=False, + check_column_type=False, + ) @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba") diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index 93ec2d9687..273eac9a43 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ 
b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -118,7 +118,14 @@ def test_correct_args(mock_predict, X_y_regression): clf.predict(X) arg_X = mock_predict.call_args[0][0] - assert_frame_equal(X_expected, arg_X) + # Index type checking ignored so the test can pass on Windows + # X_expected is int32, arg_X is int64 + assert_frame_equal( + X_expected, + arg_X, + check_index_type=False, + check_column_type=False, + ) @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 80ed613fb5..0e64d9ef72 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -420,7 +420,7 @@ def test_more_top_n_unique_values(): col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) col_1_counts = col_1_counts.sort_values( - ["col_1"], + col_1_counts.iloc[:, 0].name, ascending=False, kind="mergesort", ) @@ -429,7 +429,7 @@ def test_more_top_n_unique_values(): col_2_counts = X["col_2"].value_counts(dropna=False).to_frame() col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed) col_2_counts = col_2_counts.sort_values( - ["col_2"], + col_2_counts.iloc[:, 0].name, ascending=False, kind="mergesort", ) @@ -466,7 +466,7 @@ def test_more_top_n_unique_values_large(): col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) col_1_counts = col_1_counts.sort_values( - ["col_1"], + col_1_counts.iloc[:, 0].name, ascending=False, kind="mergesort", ) diff --git a/evalml/tests/component_tests/test_oversampler.py b/evalml/tests/component_tests/test_oversampler.py index 81cee73a58..8e26841820 100644 --- a/evalml/tests/component_tests/test_oversampler.py +++ b/evalml/tests/component_tests/test_oversampler.py @@ -109,6 +109,7 
@@ def test_oversample_imbalanced_binary(data_type, oversampler_type, make_data_typ value_counts, pd.Series([850, 850]), check_dtype=False, + check_names=False, ) oversampler = Oversampler(sampling_ratio=1) diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index 2aa8d13da9..737e84a7b5 100644 --- a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -557,14 +557,14 @@ def test_simple_imputer_ignores_natural_language( if df_composition == "full_df": if numeric_impute_strategy == "mean" and has_nan == "has_nan": - ans = X_df.mean() + ans = X_df.mean(numeric_only=True) ans["natural language col"] = pd.NA X_df = X_df.astype( {"int col": float}, ) X_df.iloc[-1, :] = ans elif numeric_impute_strategy == "median" and has_nan == "has_nan": - ans = X_df.median() + ans = X_df.median(numeric_only=True) ans["natural language col"] = pd.NA X_df = X_df.astype( {"int col": float}, diff --git a/evalml/tests/component_tests/test_target_encoder.py b/evalml/tests/component_tests/test_target_encoder.py index bc7086ed82..f5fbc5f334 100644 --- a/evalml/tests/component_tests/test_target_encoder.py +++ b/evalml/tests/component_tests/test_target_encoder.py @@ -144,7 +144,7 @@ def test_cols(): ), }, ) - assert_frame_equal(X_expected, X_t, check_less_precise=True) + assert_frame_equal(X_expected, X_t, rtol=1e-3) encoder = TargetEncoder(cols=["col_3"]) encoder.fit(X, y) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index c27adffdc3..fc3e1d4f3c 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -241,11 +241,21 @@ def test_imputer_drops_columns_with_all_nan(imputer_test_data): imputer.fit(X, y) transformed, _ = imputer.transform(X, y) expected = X.drop(["all nan cat", "all nan"], axis=1) - 
assert_frame_equal(transformed, expected, check_dtype=False) + assert_frame_equal( + transformed, + expected, + check_column_type=False, + check_index_type=False, + ) imputer = TimeSeriesImputer() transformed, _ = imputer.fit_transform(X, y) - assert_frame_equal(transformed, expected, check_dtype=False) + assert_frame_equal( + transformed, + expected, + check_column_type=False, + check_index_type=False, + ) def test_typed_imputer_numpy_input(): @@ -277,12 +287,17 @@ def test_imputer_empty_data(data_type, make_data_type): imputer = TimeSeriesImputer() imputer.fit(X, y) X_t, y_t = imputer.transform(X, y) - assert_frame_equal(X_t, X_expected, check_dtype=False) + assert_frame_equal(X_t, X_expected, check_column_type=False, check_index_type=False) assert_series_equal(y_t, y_expected, check_dtype=False) imputer = TimeSeriesImputer() transformed, _ = imputer.fit_transform(X, y) - assert_frame_equal(transformed, X_expected, check_dtype=False) + assert_frame_equal( + transformed, + X_expected, + check_column_type=False, + check_index_type=False, + ) def test_imputer_does_not_reset_index(): diff --git a/evalml/tests/component_tests/test_time_series_regularizer.py b/evalml/tests/component_tests/test_time_series_regularizer.py index 2c92177618..45d33f68b5 100644 --- a/evalml/tests/component_tests/test_time_series_regularizer.py +++ b/evalml/tests/component_tests/test_time_series_regularizer.py @@ -56,10 +56,13 @@ def assert_features_and_length_equal( dates_to_test = set(non_nan_X["dates"]) - ignore_dates rand_date = np.random.choice(list(dates_to_test), 5, replace=False) for each_date in rand_date: - input_feat = X.loc[X["dates"] == each_date, set(X.columns) - {"dates"}].iloc[0] + input_feat = X.loc[ + X["dates"] == each_date, + list(set(X.columns) - {"dates"}), + ].iloc[0] outout_feat = non_nan_X.loc[ non_nan_X["dates"] == each_date, - set(non_nan_X.columns) - {"dates"}, + list(set(non_nan_X.columns) - {"dates"}), ].iloc[0] pd.testing.assert_series_equal( 
pd.Series(input_feat.values), diff --git a/evalml/tests/component_tests/test_undersampler.py b/evalml/tests/component_tests/test_undersampler.py index 532b600ca9..13803126eb 100644 --- a/evalml/tests/component_tests/test_undersampler.py +++ b/evalml/tests/component_tests/test_undersampler.py @@ -79,8 +79,9 @@ def test_undersampler_imbalanced_output(data_type, make_data_type): assert value_counts.values[1] / value_counts.values[0] == sampling_ratio pd.testing.assert_series_equal( value_counts, - pd.Series([600, 150], index=[1, 0]), + pd.Series([600, 150], index=[1, 0], name="count"), check_dtype=False, + check_names=False, ) undersampler = Undersampler(sampling_ratio=sampling_ratio) diff --git a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py index d4fb45c4fe..0d3cfb3083 100644 --- a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py +++ b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py @@ -533,12 +533,12 @@ def test_class_imbalance_large_multiclass(test_size): [0] * 20 + [1] * 25 + [2] * 99 + [3] * 105 + [4] * 900 + [5] * 900, ) y_multiclass_huge = pd.Series([i % 200 for i in range(100000)]) - y_imbalanced_multiclass_huge = y_multiclass_huge.append( - pd.Series([200] * 10), + y_imbalanced_multiclass_huge = pd.concat( + [y_multiclass_huge, pd.Series([200] * 10)], ignore_index=True, ) - y_imbalanced_multiclass_nan = y_multiclass_huge.append( - pd.Series([np.nan] * 10), + y_imbalanced_multiclass_nan = pd.concat( + [y_multiclass_huge, pd.Series([np.nan] * 10)], ignore_index=True, ) diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py index e9a8f3f400..4d864a6eb5 100644 --- a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py @@ -244,7 +244,7 @@ def 
test_datetime_format_data_check_multiple_errors(): + ["2021-01-31", "2021-02-02", "2021-02-04"] + pd.date_range("2021-02-05", periods=90).tolist() ) - X = pd.DataFrame({"dates": dates}) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") y = pd.Series(range(21)) datetime_format_check = DateTimeFormatDataCheck(datetime_column="dates") @@ -269,7 +269,7 @@ def test_datetime_format_data_check_multiple_errors(): + ["2021-01-09", "2021-01-31", "2021-02-02", "2021-02-04"] + pd.date_range("2021-02-05", periods=90).tolist() ) - X = pd.DataFrame({"dates": dates}) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") ww_payload = infer_frequency( X["dates"], @@ -297,7 +297,7 @@ def test_datetime_format_data_check_multiple_errors(): .drop("2021-01-10") .append(pd.date_range("2021-01-15", periods=86)) ) - X = pd.DataFrame({"dates": dates}) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") ww_payload = infer_frequency( X["dates"], @@ -326,7 +326,7 @@ def test_datetime_format_data_check_multiple_errors(): .append(pd.date_range("2021-01-30", periods=1)) .append(pd.date_range("2021-01-31", periods=86, freq="2D")) ) - X = pd.DataFrame({"dates": dates}) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") ww_payload = infer_frequency( X["dates"], @@ -355,7 +355,7 @@ def test_datetime_format_data_check_multiple_errors(): .append(pd.date_range("2021-01-30", periods=1)) .append(pd.date_range("2021-01-31", periods=86, freq="2D")) ) - X = pd.DataFrame({"dates": dates}) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") ww_payload = infer_frequency( X["dates"], @@ -520,7 +520,7 @@ def test_datetime_many_duplicates_and_nans(): dates = pd.Series(pd.date_range(start="1/1/2021", periods=76)) nans = pd.Series([None] * 12) duplicates = pd.Series(pd.date_range(start="1/1/2021", periods=12)) - dates = dates.append(nans).append(duplicates) + dates = pd.concat([dates, nans, duplicates]) X = pd.DataFrame({"date": dates}, columns=["date"]) X = 
X.reset_index(drop=True) diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 6e493c24ef..c52c7da97e 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -20,7 +20,7 @@ networkx==3.1 nlp-primitives==2.11.0 numpy==1.24.4 packaging==23.1 -pandas==1.5.3 +pandas==2.0.3 plotly==5.15.0 pmdarima==2.0.3 pyzmq==25.1.0 diff --git a/evalml/tests/model_understanding_tests/test_visualizations.py b/evalml/tests/model_understanding_tests/test_visualizations.py index 3f50ff1f90..66ac019c57 100644 --- a/evalml/tests/model_understanding_tests/test_visualizations.py +++ b/evalml/tests/model_understanding_tests/test_visualizations.py @@ -667,8 +667,11 @@ def test_linear_coefficients_output(estimator): assert list(output_.index) == ["Intercept", "Second", "Fourth", "First", "Third"] assert output_.shape[0] == X.shape[1] + 1 assert ( - pd.Series(est_._component_obj.intercept_, index=["Intercept"]).append( - pd.Series(est_.feature_importance).sort_values(), + pd.concat( + [ + pd.Series(est_._component_obj.intercept_, index=["Intercept"]), + pd.Series(est_.feature_importance).sort_values(), + ], ) == output_.values ).all() diff --git a/pyproject.toml b/pyproject.toml index 0927a196ec..f2d15a0dfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ license = {file = "LICENSE"} requires-python = ">=3.8,<4" dependencies = [ "numpy >= 1.21.0", - "pandas >= 1.5.0, <2.0.0", + "pandas >= 1.5.0", "scipy >= 1.5.0", "scikit-learn >= 1.2.2", "scikit-optimize >= 0.9.0", From 4a8cc0f91ba0b36971fe2cf5de3243d33dc9a35a Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 31 Jul 2023 15:54:17 -0700 Subject: [PATCH 15/47] reset condition for period --- .../components/transformers/preprocessing/stl_decomposer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 294f3a3c7f..d7588a735e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -199,7 +199,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: # Save the frequency of the fitted series for checking against transform data. self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) # Determine the period of the seasonal component - self.set_period(series_X, series_y) + if self.is_multiseries or self.period is None: + self.set_period(series_X, series_y) stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() @@ -220,7 +221,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: "seasonal": self.seasonal, "seasonality": self.seasonality, "trend": self.trend, - "residual": self.resid, + "residual": self.residual, "period": self.period, } From a8c2445da9d055cc3d43676ad2cfa05ce8ab6425 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 2 Aug 2023 18:06:11 -0700 Subject: [PATCH 16/47] take dataframe as y input and fix indexing --- .../preprocessing/stl_decomposer.py | 84 +++++++------------ 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index c45f01c4c8..ab4da4c219 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -137,7 +137,7 @@ def _project_trend_and_seasonality(self, y): ) return projected_trend, projected_seasonality - def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: + def fit(self, X: pd.DataFrame, y: pd.DataFrame = 
None) -> STLDecomposer: """Fits the STLDecomposer and determine the seasonal signal. Instantiates a statsmodels STL decompose object with the component's stored @@ -167,40 +167,30 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: self.logger.warning( f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) + X, y = self._check_target(X, y) # If there is not a series_id, give them one series id with the value 0 - if self.series_id is None: - self.series_id = "series_id" - X[self.series_id] = 0 - else: + if self.series_id: self.is_multiseries = True - # Initialize the new "series_id" column in Woodwork - X.ww.init() + if isinstance(y, pd.Series): + y = y.to_frame() - # Group the data by series_id - grouped_X = X.groupby(self.series_id) # Iterate through each id group self.decompositions = {} - for id, series_X in grouped_X: - - if y is None: - series_y = None - elif isinstance(series_X.index, pd.DatetimeIndex): - series_y = y[(series_X.reset_index(drop=True).index)] - else: - series_y = y[series_X.index] + for id in y.columns: + series_y = y[id] self.original_index = series_y.index if series_y is not None else None - series_X, series_y = self._check_target(series_X, series_y) + X, series_y = self._check_target(X, series_y) self._map_dt_to_integer(self.original_index, series_y.index) # Save the frequency of the fitted series for checking against transform data. 
self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) # Determine the period of the seasonal component - if self.is_multiseries or self.period is None: - self.set_period(series_X, series_y) + # if self.is_multiseries or self.period is None: + self.set_period(X, series_y) stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() @@ -230,7 +220,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: def transform( self, X: pd.DataFrame, - y: pd.Series = None, + y: pd.DataFrame = None, ): """Transforms the target data by removing the STL trend and seasonality. @@ -254,32 +244,26 @@ def transform( if y is None: return X, y - if not self.is_multiseries and X is not None: - self.series_id = "series_id" - X[self.series_id] = 0 - # If X is None, create a series with id=0 and series_X=None - grouped_X = {0: X}.items() if X is None else X.groupby(self.series_id) + if isinstance(y, pd.Series): + y = y.to_frame() features_list = [] detrending_list = [] - for id, series_X in grouped_X: + # Iterate through each id group + self.decompositions = {} + for id in y.columns: + series_y = y[id] if self.is_multiseries: self.seasonality = self.decompositions[id]["seasonality"] self.trend = self.decompositions[id]["trend"] self.seasonal = self.decompositions[id]["seasonal"] self.residual = self.decompositions[id]["residual"] self.period = self.decompositions[id]["period"] - if isinstance(series_X.index, pd.DatetimeIndex): - series_y = y[(series_X.reset_index(drop=True).index)] - else: - series_y = y[series_X.index] - else: - series_y = y if series_y is None: - return series_X, series_y + return X, series_y original_index = series_y.index - series_X, series_y = self._check_target(series_X, series_y) + X, series_y = self._check_target(X, series_y) self._check_oos_past(series_y) y_in_sample = pd.Series([]) @@ -316,9 +300,9 @@ def transform( # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] if not 
self.is_multiseries: - return series_X, y_t + return X, y_t - features_list.append({id: series_X}) + features_list.append({id: X}) detrending_list.append({id: y_t}) # Convert the list to a DataFrame @@ -327,7 +311,6 @@ def transform( detrending_df = pd.DataFrame(detrending_list) return features_df, detrending_df - def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: """Adds back fitted trend and seasonality to target variable. @@ -518,7 +501,7 @@ def get_trend_prediction_intervals(self, y, coverage=None): def plot_decomposition( self, X: pd.DataFrame, - y: pd.Series, + y: pd.DataFrame, show: bool = False, ): """Plots the decomposition of the target signal. @@ -537,31 +520,24 @@ def plot_decomposition( """ - # Group the data by series_id - grouped_X = X.groupby(self.series_id) + if isinstance(y, pd.Series): + y = y.to_frame() # Iterate through each series id plot_info = {} - for id, series_X in grouped_X: + for id in y.columns: + print(id) + series_y = y[id] + if self.is_multiseries: + X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) self.seasonality = self.decompositions[id]["seasonality"] self.seasonal = self.decompositions[id]["seasonal"] self.trend = self.decompositions[id]["trend"] self.residual = self.decompositions[id]["residual"] self.period = self.decompositions[id]["period"] - if isinstance(series_X.index, pd.DatetimeIndex): - series_y = y[(series_X.reset_index(drop=True).index)] - else: - series_y = y[series_X.index] - - if self.is_multiseries: - series_X.index = pd.DatetimeIndex( - series_X[self.time_index], - freq=self.frequency, - ) - - decomposition_results = self.get_trend_dataframe(series_X, series_y) + decomposition_results = self.get_trend_dataframe(X, series_y) fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) From 13c5d29d60ae9534341d3110b719a250d3faaf16 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 3 Aug 2023 08:44:59 -0700 Subject: [PATCH 17/47] fix lint --- 
evalml/tests/pipeline_tests/test_pipeline_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index d193de05b0..f2c51c6b2f 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1379,7 +1379,6 @@ def test_make_pipeline_features_and_dfs(X_y_binary): @pytest.mark.parametrize("target_name", ["target", "Target_Data"]) - def test_unstack_multiseries( target_name, multiseries_ts_data_stacked, @@ -1413,7 +1412,6 @@ def test_unstack_multiseries( @pytest.mark.parametrize("include_series_id", [True, False]) @pytest.mark.parametrize("series_id_name", [None, "SERIES"]) - def test_stack_data( include_series_id, series_id_name, From 354c95a957f062cc69ed27948e0e73e0753e01b3 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 3 Aug 2023 09:43:23 -0700 Subject: [PATCH 18/47] formatting --- evalml/tests/pipeline_tests/test_pipeline_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index f2c51c6b2f..2064dcc835 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1422,7 +1422,6 @@ def test_stack_data( _, y_stacked = multiseries_ts_data_stacked y_stacked.name = "target" - y_stacked_transformed = stack_data( y, include_series_id=include_series_id, From a50610199ba92e965190b2503c5a968ab61fd4ef Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 3 Aug 2023 10:58:41 -0700 Subject: [PATCH 19/47] remove print statement --- .../components/transformers/preprocessing/stl_decomposer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index ab4da4c219..3eba29dc95 100644 --- 
a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -526,7 +526,6 @@ def plot_decomposition( # Iterate through each series id plot_info = {} for id in y.columns: - print(id) series_y = y[id] if self.is_multiseries: From 5745a106b68b768a7e110fd3aa10023b675d1661 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 3 Aug 2023 12:27:15 -0700 Subject: [PATCH 20/47] pd 2 support --- .../components/transformers/preprocessing/stl_decomposer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 3eba29dc95..0c7541184c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -295,7 +295,7 @@ def transform( index=truncated_y.index, ), ) - y_t = y_in_sample.append(y_out_of_sample) + y_t = pd.concat([y_in_sample, y_out_of_sample]) y_t.index = original_index # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] From 3363e10a0cb8f361d3d41218766ae211eedf6ef1 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 7 Aug 2023 15:07:01 -0400 Subject: [PATCH 21/47] update inverse_transform and get_trend_dataframe --- .../preprocessing/stl_decomposer.py | 220 ++++++++++-------- 1 file changed, 122 insertions(+), 98 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 0c7541184c..bda697bf04 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -47,12 +47,10 @@ def __init__( period: int = None, seasonal_smoother: int = 7, random_seed: int = 0, - 
is_multiseries: bool = False, **kwargs, ): self.logger = logging.getLogger(__name__) self.series_id = series_id - self.is_multiseries = is_multiseries # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. if seasonal_smoother % 2 == 0: @@ -63,13 +61,19 @@ def __init__( seasonal_smoother += 1 self.forecast_summary = None + parameters = { + "degree": degree, + "period": period, + "seasonal_smoother": seasonal_smoother, + "time_index": time_index, + "series_id": series_id, + } + parameters.update(kwargs) + super().__init__( component_obj=None, random_seed=random_seed, - degree=degree, - period=period, - seasonal_smoother=seasonal_smoother, - time_index=time_index, + **parameters, **kwargs, ) @@ -153,7 +157,7 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> STLDecomposer: Args: X (pd.DataFrame, optional): Conditionally used to build datetime index. - y (pd.Series): Target variable to detrend and deseasonalize. + y (pd.Series or pd.DataFrame): Target variable to detrend and deseasonalize. Returns: self @@ -169,13 +173,14 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> STLDecomposer: ) X, y = self._check_target(X, y) - # If there is not a series_id, give them one series id with the value 0 - if self.series_id: - self.is_multiseries = True - if isinstance(y, pd.Series): y = y.to_frame() + # If there is a series_id in stacked data or more than one column in unstacked data, set multiseries to true + is_multiseries = False + if self.series_id or len(y.columns) > 1: + is_multiseries = True + # Iterate through each id group self.decompositions = {} for id in y.columns: @@ -189,8 +194,8 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> STLDecomposer: # Save the frequency of the fitted series for checking against transform data. 
self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) # Determine the period of the seasonal component - # if self.is_multiseries or self.period is None: - self.set_period(X, series_y) + if is_multiseries or self.period is None: + self.set_period(X, series_y) stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) res = stl.fit() @@ -206,7 +211,7 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> STLDecomposer: self.trend = res.trend self.residual = res.resid - if self.is_multiseries: + if is_multiseries: self.decompositions[id] = { "seasonal": self.seasonal, "seasonality": self.seasonality, @@ -230,7 +235,7 @@ def transform( Args: X (pd.DataFrame, optional): Conditionally used to build datetime index. - y (pd.Series): Target variable to detrend and deseasonalize. + y (pd.Series or pd.DataFrame): Target variable to detrend and deseasonalize. Returns: (Single series) pd.DataFrame, pd.Series: The list of input features are returned without modification. The target @@ -250,10 +255,9 @@ def transform( features_list = [] detrending_list = [] # Iterate through each id group - self.decompositions = {} for id in y.columns: series_y = y[id] - if self.is_multiseries: + if len(y.columns) > 1: self.seasonality = self.decompositions[id]["seasonality"] self.trend = self.decompositions[id]["trend"] self.seasonal = self.decompositions[id]["seasonal"] @@ -299,7 +303,7 @@ def transform( y_t.index = original_index # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] - if not self.is_multiseries: + if len(y.columns) <= 1: return X, y_t features_list.append({id: X}) @@ -311,17 +315,17 @@ def transform( detrending_df = pd.DataFrame(detrending_list) return features_df, detrending_df - def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: + def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """Adds back fitted trend and seasonality to target variable. 
The STL trend is projected to cover the entire requested target range, then added back into the signal. Then, the seasonality is projected forward to and added back into the signal. Args: - y_t (pd.Series): Target variable. + y_t (pd.Series or pd.DataFrame): Target variable. Returns: - tuple of pd.DataFrame, pd.Series: The first element are the input features returned without modification. + tuple of pd.DataFrame, pd.DataFrame: The first element are the input features returned without modification. The second element is the target variable y with the trend and seasonality added back in. Raises: @@ -334,54 +338,67 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: y_t = infer_feature_types(y_t) self._check_oos_past(y_t) - index = self._choose_proper_index(y_t) + if isinstance(y_t, pd.Series): + y_t = y_t.to_frame() - y_in_sample = pd.Series([]) - y_out_of_sample = pd.Series([]) + y_in_sample_series = pd.Series([]) + y_out_of_sample_series = pd.Series([]) + for id in y_t.columns: + y_in_sample = pd.Series([]) + y_out_of_sample = pd.Series([]) + series_y = y_t[id] - # For partially and wholly in-sample data, retrieve stored results. - if index[0] <= y_t.index[0] <= index[-1]: - left_index = y_t.index[0] - right_index = ( - y_t.index[-1] + 1 - if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else y_t.index[-1] + 1 * y_t.index.freq - ) - trend = ( - self.trend.reset_index(drop=True)[left_index:right_index] - if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.trend[left_index:right_index] - ) - seasonal = ( - self.seasonal.reset_index( - drop=True, - )[left_index:right_index] - if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.seasonal[left_index:right_index] - ) - y_in_sample = y_t + trend + seasonal - y_in_sample = y_in_sample.dropna() - - # For out of sample data.... 
- if y_t.index[-1] > index[-1]: - try: - # ...that is partially out of sample and partially in sample. - truncated_y_t = y_t[y_t.index.get_loc(index[-1]) + 1 :] - except KeyError: - # ...that is entirely out of sample. - truncated_y_t = y_t - ( - projected_trend, - projected_seasonality, - ) = self._project_trend_and_seasonality(truncated_y_t) - - y_out_of_sample = infer_feature_types( - pd.Series( - truncated_y_t + projected_trend + projected_seasonality, - index=truncated_y_t.index, - ), - ) - y = pd.concat([y_in_sample, y_out_of_sample]) + index = self._choose_proper_index(series_y) + + # For partially and wholly in-sample data, retrieve stored results. + if index[0] <= series_y.index[0] <= index[-1]: + left_index = series_y.index[0] + right_index = ( + series_y.index[-1] + 1 + if isinstance(series_y.index, pd.RangeIndex) + or series_y.index.is_numeric() + else series_y.index[-1] + 1 * series_y.index.freq + ) + trend = ( + self.trend.reset_index(drop=True)[left_index:right_index] + if isinstance(series_y.index, pd.RangeIndex) + or series_y.index.is_numeric() + else self.trend[left_index:right_index] + ) + seasonal = ( + self.seasonal.reset_index(drop=True)[left_index:right_index] + if isinstance(series_y.index, pd.RangeIndex) + or series_y.index.is_numeric() + else self.seasonal[left_index:right_index] + ) + y_in_sample = series_y + trend + seasonal + y_in_sample = y_in_sample.dropna() + y_in_sample_series = pd.concat([y_in_sample_series, y_in_sample]) + + # For out of sample data.... + if series_y.index[-1] > index[-1]: + try: + # ...that is partially out of sample and partially in sample. + truncated_y_t = series_y[series_y.index.get_loc(index[-1]) + 1 :] + except KeyError: + # ...that is entirely out of sample. 
+ truncated_y_t = series_y + ( + projected_trend, + projected_seasonality, + ) = self._project_trend_and_seasonality(truncated_y_t) + + y_out_of_sample = infer_feature_types( + pd.Series( + truncated_y_t + projected_trend + projected_seasonality, + index=truncated_y_t.index, + ), + ) + y_out_of_sample_series = pd.concat( + [y_out_of_sample_series, y_out_of_sample], + ) # Corrected this line + + y = pd.concat([y_in_sample_series, y_out_of_sample_series]) y.index = original_index return y @@ -406,12 +423,7 @@ def get_trend_dataframe(self, X, y): """ X = infer_feature_types(X) - if not isinstance(X.index, pd.DatetimeIndex): - raise TypeError("Provided X should have datetimes in the index.") - if X.index.freq is None: - raise ValueError( - "Provided DatetimeIndex of X should have an inferred frequency.", - ) + # Change the y index to a matching datetimeindex or else we get a failure # in ForecastingHorizon during decomposition. if not isinstance(y.index, pd.DatetimeIndex): @@ -419,8 +431,6 @@ def get_trend_dataframe(self, X, y): self._check_oos_past(y) - result_dfs = [] - def _decompose_target(X, y, fh): """Function to generate a single DataFrame with trend, seasonality and residual components.""" if len(y.index) == len(self.trend.index) and all( @@ -449,17 +459,41 @@ def _decompose_target(X, y, fh): ) if isinstance(y, pd.Series): - result_dfs.append(_decompose_target(X, y, None)) - elif isinstance(y, pd.DataFrame): - for colname in y.columns: - result_dfs.append(_decompose_target(X, y[colname], None)) - return result_dfs + y = y.to_frame() + series_results = {} + # Iterate through each series id + for id in y.columns: + result_dfs = [] + if not isinstance(X.index, pd.DatetimeIndex): + raise TypeError("Provided X should have datetimes in the index.") + if X.index.freq is None: + raise ValueError( + "Provided DatetimeIndex of X should have an inferred frequency.", + ) + + # if it is multiseries, set the frequency and get values per series + if len(y.columns) > 1: + 
X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) + self.seasonality = self.decompositions[id]["seasonality"] + self.seasonal = self.decompositions[id]["seasonal"] + self.trend = self.decompositions[id]["trend"] + self.residual = self.decompositions[id]["residual"] + self.period = self.decompositions[id]["period"] + + series_y = y[id] + if isinstance(series_y, pd.Series): + result_dfs.append(_decompose_target(X, series_y, None)) + elif isinstance(series_y, pd.DataFrame): + for colname in series_y.columns: + result_dfs.append(_decompose_target(X, series_y[colname], None)) + series_results[id] = result_dfs + return series_results def get_trend_prediction_intervals(self, y, coverage=None): """Calculate the prediction intervals for the trend data. Args: - y (pd.Series): Target data. + y (pd.Series or pd.DataFrame): Target data. coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the prediction interval should be calculated for. 
@@ -525,31 +559,21 @@ def plot_decomposition( # Iterate through each series id plot_info = {} - for id in y.columns: - series_y = y[id] - - if self.is_multiseries: - X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) - self.seasonality = self.decompositions[id]["seasonality"] - self.seasonal = self.decompositions[id]["seasonal"] - self.trend = self.decompositions[id]["trend"] - self.residual = self.decompositions[id]["residual"] - self.period = self.decompositions[id]["period"] - - decomposition_results = self.get_trend_dataframe(X, series_y) + decomposition_results = self.get_trend_dataframe(X, y) + for id in y.columns: fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) - axs[0].plot(decomposition_results[0]["signal"], "r") + axs[0].plot(decomposition_results[id][0]["signal"], "r") axs[0].set_title("signal") - axs[1].plot(decomposition_results[0]["trend"], "b") + axs[1].plot(decomposition_results[id][0]["trend"], "b") axs[1].set_title("trend") - axs[2].plot(decomposition_results[0]["seasonality"], "g") + axs[2].plot(decomposition_results[id][0]["seasonality"], "g") axs[2].set_title("seasonality") - axs[3].plot(decomposition_results[0]["residual"], "y") + axs[3].plot(decomposition_results[id][0]["residual"], "y") axs[3].set_title("residual") - if self.is_multiseries: + if len(y.columns) > 1: fig.suptitle("Decomposition for Series {}".format(id)) plot_info[id] = (fig, axs) else: From 1768261a02f1a61aca45c6a5b2c987f7147cde01 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 7 Aug 2023 16:54:12 -0400 Subject: [PATCH 22/47] update get_trend_prediction_intervals and add multiseries tests --- .../preprocessing/stl_decomposer.py | 62 ++++--- .../decomposer_tests/test_decomposer.py | 169 +++++++++++++++++- .../decomposer_tests/test_stl_decomposer.py | 58 ++++-- 3 files changed, 235 insertions(+), 54 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py 
b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index bda697bf04..f138dbd07a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -500,34 +500,40 @@ def get_trend_prediction_intervals(self, y, coverage=None): Returns: dict of pd.Series: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper. """ - if coverage is None: - coverage = [0.95] + if isinstance(y, pd.Series): + y = y.to_frame() - self._check_oos_past(y) - alphas = [1 - val for val in coverage] - - if not self.forecast_summary or len(y) != len( - self.forecast_summary.predicted_mean, - ): - self._project_trend_and_seasonality(y) - - prediction_interval_result = {} - for i, alpha in enumerate(alphas): - result = self.forecast_summary.summary_frame(alpha=alpha) - overlapping_ind = [ind for ind in y.index if ind in result.index] - intervals = pd.DataFrame( - { - "lower": result["mean_ci_lower"] - result["mean"], - "upper": result["mean_ci_upper"] - result["mean"], - }, - ) - if len(overlapping_ind) > 0: # y.index is datetime - intervals = intervals.loc[overlapping_ind] - else: # y.index is not datetime (e.g. 
int) - intervals = intervals[-len(y) :] - intervals.index = y.index - prediction_interval_result[f"{coverage[i]}_lower"] = intervals["lower"] - prediction_interval_result[f"{coverage[i]}_upper"] = intervals["upper"] + for id in y.columns: + y_series = y[id] + + if coverage is None: + coverage = [0.95] + + self._check_oos_past(y_series) + alphas = [1 - val for val in coverage] + + if not self.forecast_summary or len(y_series) != len( + self.forecast_summary.predicted_mean, + ): + self._project_trend_and_seasonality(y_series) + + prediction_interval_result = {} + for i, alpha in enumerate(alphas): + result = self.forecast_summary.summary_frame(alpha=alpha) + overlapping_ind = [ind for ind in y_series.index if ind in result.index] + intervals = pd.DataFrame( + { + "lower": result["mean_ci_lower"] - result["mean"], + "upper": result["mean_ci_upper"] - result["mean"], + }, + ) + if len(overlapping_ind) > 0: # y.index is datetime + intervals = intervals.loc[overlapping_ind] + else: # y.index is not datetime (e.g. 
int) + intervals = intervals[-len(y_series) :] + intervals.index = y_series.index + prediction_interval_result[f"{coverage[i]}_lower"] = intervals["lower"] + prediction_interval_result[f"{coverage[i]}_upper"] = intervals["upper"] return prediction_interval_result @@ -560,6 +566,8 @@ def plot_decomposition( # Iterate through each series id plot_info = {} decomposition_results = self.get_trend_dataframe(X, y) + if isinstance(y, pd.Series): + y = y.to_frame() for id in y.columns: fig, axs = plt.subplots(4) diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index aaf924f626..09fee2edf5 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -61,11 +61,32 @@ def test_decomposer_init_raises_error_if_degree_not_int(decomposer_child_class): "y_has_time_index", ["y_has_time_index", "y_doesnt_have_time_index"], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_plot_decomposition( decomposer_child_class, y_has_time_index, generate_seasonal_data, + variateness, + multiseries_ts_data_unstacked, ): + if variateness == "univariate": + x = np.arange(0, 2 * np.pi, 0.01) + dts = pd.date_range(datetime.today(), periods=len(x)) + X = pd.DataFrame({"x": x}) + X = X.set_index(dts) + y = pd.Series(np.sin(x)) + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked step = 0.01 period = 9 X, y = generate_seasonal_data(real_or_synthetic="synthetic")(period, step) @@ -101,15 +122,34 @@ def test_decomposer_plot_decomposition( "time_index_is_specified_but_wrong", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) 
def test_decomposer_uses_time_index( decomposer_child_class, ts_data, + multiseries_ts_data_unstacked, + variateness, X_has_time_index, X_num_time_columns, y_has_time_index, time_index_specified, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked + X.index = X["date"] + y = y.set_axis(X.index) + X.ww.init() time_index_col_name = "date" assert isinstance(X.index, pd.DatetimeIndex) @@ -345,6 +385,7 @@ def test_decomposer_projected_seasonality_integer_and_datetime( }[test_first_index] X, _, y = ts_data() + datetime_index = pd.date_range(start="01-01-2002", periods=len(X), freq="M") if not has_freq: datetime_index.freq = None @@ -393,11 +434,33 @@ def test_decomposer_projected_seasonality_integer_and_datetime( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_get_trend_dataframe_raises_errors( decomposer_child_class, ts_data, + multiseries_ts_data_unstacked, + variateness, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked + dts = pd.date_range("01-01-2000", periods=len(X), freq="MS") + datetime_index = pd.DatetimeIndex(dts) + X.index = datetime_index + y.index = datetime_index + X["date"] = dts + dec = decomposer_child_class() dec.fit_transform(X, y) @@ -546,15 +609,33 @@ def test_decomposer_determine_periodicity_nullable_type_incompatibility( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( 
+ "variateness", + [ + "univariate", + "multivariate", + ], +) @pytest.mark.parametrize("fit_before_decompose", [True, False]) def test_decomposer_get_trend_dataframe_error_not_fit( decomposer_child_class, ts_data, + multiseries_ts_data_unstacked, + variateness, fit_before_decompose, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked + X.index = X["date"] + X.index.freq = "D" - dec = decomposer_child_class() + dec = decomposer_child_class(time_index="date") if fit_before_decompose: dec.fit_transform(X, y) dec.get_trend_dataframe(X, y) @@ -569,11 +650,28 @@ def test_decomposer_get_trend_dataframe_error_not_fit( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_transform_returns_same_when_y_none( decomposer_child_class, ts_data, + multiseries_ts_data_unstacked, + variateness, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked + dec = decomposer_child_class().fit(X, y) X_t, y_t = dec.transform(X, None) pd.testing.assert_frame_equal(X, X_t) @@ -584,11 +682,27 @@ def test_decomposer_transform_returns_same_when_y_none( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_raises_value_error_target_is_none( decomposer_child_class, ts_data, + multiseries_ts_data_unstacked, + variateness, ): - X, _, y = ts_data() + if 
variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked with pytest.raises(ValueError, match="cannot be None for Decomposer!"): decomposer_child_class(degree=3).fit_transform(X, None) @@ -606,11 +720,28 @@ def test_decomposer_raises_value_error_target_is_none( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_bad_target_index( decomposer_child_class, ts_data, + multiseries_ts_data_unstacked, + variateness, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked + dec = decomposer_child_class() y.index = pd.CategoricalIndex(["cat_index" for x in range(len(y))]) with pytest.raises( @@ -801,8 +932,28 @@ def test_decomposer_doesnt_modify_target_index( "decomposer_child_class", decomposer_list, ) -def test_decomposer_monthly_begin_data(decomposer_child_class, ts_data): - X, _, y = ts_data() +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_decomposer_monthly_begin_data( + decomposer_child_class, + ts_data, + multiseries_ts_data_unstacked, + variateness, +): + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = multiseries_ts_data_unstacked + dts = pd.date_range("01-01-2000", 
periods=len(X), freq="MS") datetime_index = pd.DatetimeIndex(dts) X.index = datetime_index diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index afedf2c686..50e6bad47e 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -18,6 +18,18 @@ def test_stl_decomposer_init(): "period": None, "seasonal_smoother": 7, "time_index": "dates", + "series_id": None, + } + + +def test_stl_decomposer_multiseries_init(): + decomp = STLDecomposer(degree=3, time_index="dates", series_id="ids") + assert decomp.parameters == { + "degree": 3, + "period": None, + "seasonal_smoother": 7, + "time_index": "dates", + "series_id": "ids", } @@ -241,16 +253,20 @@ def test_stl_decomposer_get_trend_dataframe( subset_y = pd.concat([subset_y, subset_y], axis=1) result_dfs = dec.get_trend_dataframe(subset_X, subset_y) - - assert isinstance(result_dfs, list) - assert all(isinstance(x, pd.DataFrame) for x in result_dfs) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(y, pd.DataFrame) for y in result_dfs[x]) for x in result_dfs + ) if variateness == "univariate": - assert len(result_dfs) == 1 - [get_trend_dataframe_format_correct(x) for x in result_dfs] + assert len(result_dfs[0]) == 1 + [get_trend_dataframe_format_correct(x) for x in result_dfs[0]] elif variateness == "multivariate": - assert len(result_dfs) == 2 - [get_trend_dataframe_format_correct(x) for idx, x in enumerate(result_dfs)] + assert len(result_dfs[0]) == 2 + [ + get_trend_dataframe_format_correct(x) + for idx, x in enumerate(result_dfs[0]) + ] elif transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -279,17 +295,21 @@ def 
test_stl_decomposer_get_trend_dataframe( else: result_dfs = dec.get_trend_dataframe(X.loc[y_t_new.index], y_t_new) - assert isinstance(result_dfs, list) - assert all(isinstance(x, pd.DataFrame) for x in result_dfs) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(y, pd.DataFrame) for y in result_dfs[x]) + for x in result_dfs + ) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs[0]) if variateness == "univariate": - assert len(result_dfs) == 1 - [get_trend_dataframe_format_correct(x) for x in result_dfs] + assert len(result_dfs[0]) == 1 + [get_trend_dataframe_format_correct(x) for x in result_dfs[0]] elif variateness == "multivariate": - assert len(result_dfs) == 2 + assert len(result_dfs[0]) == 2 [ get_trend_dataframe_format_correct(x) - for idx, x in enumerate(result_dfs) + for idx, x in enumerate(result_dfs[0]) ] @@ -306,9 +326,11 @@ def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( stl.fit(X, y) result_dfs = stl.get_trend_dataframe(X, y) - assert isinstance(result_dfs, list) - assert all(isinstance(x, pd.DataFrame) for x in result_dfs) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(y, pd.DataFrame) for y in result_dfs[x]) for x in result_dfs + ) @pytest.mark.parametrize( From e3291959504551fc522fe4895116568b27f783a6 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 8 Aug 2023 09:53:26 -0400 Subject: [PATCH 23/47] update index in plotting instead --- .../transformers/preprocessing/stl_decomposer.py | 11 ++++++----- .../decomposer_tests/test_decomposer.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py 
b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index f138dbd07a..088bce355e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -471,9 +471,7 @@ def _decompose_target(X, y, fh): "Provided DatetimeIndex of X should have an inferred frequency.", ) - # if it is multiseries, set the frequency and get values per series if len(y.columns) > 1: - X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) self.seasonality = self.decompositions[id]["seasonality"] self.seasonal = self.decompositions[id]["seasonal"] self.trend = self.decompositions[id]["trend"] @@ -563,12 +561,15 @@ def plot_decomposition( if isinstance(y, pd.Series): y = y.to_frame() - # Iterate through each series id - plot_info = {} - decomposition_results = self.get_trend_dataframe(X, y) if isinstance(y, pd.Series): y = y.to_frame() + plot_info = {} + if self.frequency and len(y.columns) > 1: + X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) + decomposition_results = self.get_trend_dataframe(X, y) + + # Iterate through each series id for id in y.columns: fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index 09fee2edf5..ff820c3456 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -455,7 +455,7 @@ def test_decomposer_get_trend_dataframe_raises_errors( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) X, y = multiseries_ts_data_unstacked - dts = pd.date_range("01-01-2000", periods=len(X), freq="MS") + dts = pd.date_range("01-01-2000", periods=len(X)) datetime_index = pd.DatetimeIndex(dts) X.index = datetime_index y.index = datetime_index From 
61d1a18b13b694ab5a61af5b506d2c3b85d894d6 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 8 Aug 2023 16:37:47 -0400 Subject: [PATCH 24/47] add ms seasonal data --- .../preprocessing/stl_decomposer.py | 42 +++-- .../decomposer_tests/test_decomposer.py | 39 ++++- .../decomposer_tests/test_stl_decomposer.py | 150 ++++++++++++++---- evalml/tests/conftest.py | 95 +++++++++++ 4 files changed, 274 insertions(+), 52 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 088bce355e..d9d9d41c9e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -251,8 +251,6 @@ def transform( if isinstance(y, pd.Series): y = y.to_frame() - - features_list = [] detrending_list = [] # Iterate through each id group for id in y.columns: @@ -306,14 +304,12 @@ def transform( if len(y.columns) <= 1: return X, y_t - features_list.append({id: X}) - detrending_list.append({id: y_t}) + detrending_list.append(y_t) # Convert the list to a DataFrame # For multiseries, return tuple[pd.DataFrame, pd.Dataframe] where each column is a series_id - features_df = pd.DataFrame(features_list) - detrending_df = pd.DataFrame(detrending_list) - return features_df, detrending_df + detrending_df = pd.DataFrame(detrending_list).T + return X, detrending_df def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """Adds back fitted trend and seasonality to target variable. 
@@ -341,8 +337,7 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra if isinstance(y_t, pd.Series): y_t = y_t.to_frame() - y_in_sample_series = pd.Series([]) - y_out_of_sample_series = pd.Series([]) + y = [] for id in y_t.columns: y_in_sample = pd.Series([]) y_out_of_sample = pd.Series([]) @@ -373,7 +368,6 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra ) y_in_sample = series_y + trend + seasonal y_in_sample = y_in_sample.dropna() - y_in_sample_series = pd.concat([y_in_sample_series, y_in_sample]) # For out of sample data.... if series_y.index[-1] > index[-1]: @@ -394,13 +388,29 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra index=truncated_y_t.index, ), ) - y_out_of_sample_series = pd.concat( - [y_out_of_sample_series, y_out_of_sample], - ) # Corrected this line + y_series = pd.concat([y_in_sample, y_out_of_sample]) + + y.append(y_series) + y_df = pd.DataFrame(y).T + y_df.index = original_index + return y_df + + def fit_transform( + self, + X: pd.DataFrame, + y: pd.DataFrame = None, + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Removes fitted trend and seasonality from target variable. + + Args: + X (pd.DataFrame, optional): Ignored. + y (pd.Series): Target variable to detrend and deseasonalize. - y = pd.concat([y_in_sample_series, y_out_of_sample_series]) - y.index = original_index - return y + Returns: + tuple of pd.DataFrame, pd.Series: The first element are the input features returned without modification. + The second element is the target variable y with the fitted trend removed. + """ + return self.fit(X, y).transform(X, y) def get_trend_dataframe(self, X, y): """Return a list of dataframes with 4 columns: signal, trend, seasonality, residual. 
diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index ff820c3456..50e6278d92 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -767,21 +767,46 @@ def test_decomposer_bad_target_index( "partially-out-of-sample-in-past", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_fit_transform_out_of_sample( decomposer_child_class, + variateness, + generate_multiseries_seasonal_data, generate_seasonal_data, transformer_fit_on_data, ): # Generate 10 periods (the default) of synthetic seasonal data period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + subset_y = y[2 * period : 7 * period] + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + subset_y = [] + for id in y.columns: + subset_y.append(y[id][2 * period : 7 * period]) + subset_y = pd.DataFrame(subset_y) subset_X = X[2 * period : 7 * period] - subset_y = y[2 * period : 7 * period] decomposer = decomposer_child_class(period=period) decomposer.fit(subset_X, subset_y) 
diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 50e6bad47e..f836257ed9 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -41,8 +41,23 @@ def test_stl_decomposer_auto_sets_seasonal_smoother_to_odd(): assert stl.seasonal_smoother == 5 -def test_stl_raises_warning_high_smoother(caplog, ts_data): - X, _, y = ts_data() +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_raises_warning_high_smoother( + caplog, + ts_data, + multiseries_ts_data_unstacked, + variateness, +): + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + X, y = multiseries_ts_data_unstacked stl = STLDecomposer(seasonal_smoother=101) stl.fit(X, y) assert "STLDecomposer may perform poorly" in caplog.text @@ -91,17 +106,33 @@ def test_stl_sets_determined_period( ], ) @pytest.mark.parametrize("trend_degree", [1, 2, 3]) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_fit_transform_in_sample( period, freq, trend_degree, generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period, - freq_str=freq, - trend_degree=trend_degree, - ) + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period, + freq_str=freq, + trend_degree=trend_degree, + ) + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period, + freq_str=freq, + trend_degree=trend_degree, + ) # Get the expected answer lin_reg = LinearRegression(fit_intercept=True) @@ -115,15 +146,29 @@ def test_stl_fit_transform_in_sample( X_t, y_t = stl.fit_transform(X, y) - # Check to make sure STL 
detrended/deseasoned - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(y_t))), - y_t, - check_exact=False, - check_index=False, - check_names=False, - atol=0.1, - ) + if variateness == "univariate": + # Check to make sure STL detrended/deseasoned + pd.testing.assert_series_equal( + pd.Series(np.zeros(len(y_t))), + y_t, + check_exact=False, + check_index=False, + check_names=False, + atol=0.1, + ) + elif variateness == "multivariate": + pd.testing.assert_series_equal( + pd.DataFrame( + np.zeros((len(y_t), len(y_t.columns))), + columns=y_t.columns, + index=y_t.index, + ), + y_t, + check_exact=False, + check_index=False, + check_names=False, + atol=0.1, + ) # Check the trend to make sure STL worked properly pd.testing.assert_series_equal( @@ -231,6 +276,7 @@ def test_stl_decomposer_inverse_transform( @pytest.mark.parametrize("fit_before_decompose", [True, False]) def test_stl_decomposer_get_trend_dataframe( generate_seasonal_data, + generate_multiseries_seasonal_data, transformer_fit_on_data, fit_before_decompose, variateness, @@ -313,13 +359,29 @@ def test_stl_decomposer_get_trend_dataframe( ] +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=7, + set_time_index=False, + ) + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=7, + set_time_index=False, + ) + assert not isinstance(y.index, pd.DatetimeIndex) stl = STLDecomposer() @@ -337,30 +399,60 @@ def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( "bad_frequency", ["T", "A"], ) +@pytest.mark.parametrize( + "variateness", + 
[ + "univariate", + "multivariate", + ], +) def test_unsupported_frequencies( bad_frequency, generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, ): """This test exists to highlight that even though the underlying statsmodels STL component won't work for minute or annual frequencies, we can still run these frequencies with automatic period detection. """ - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - freq_str=bad_frequency, - ) - + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=7, + freq_str=bad_frequency, + ) + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=7, + freq_str=bad_frequency, + ) stl = STLDecomposer() X_t, y_t = stl.fit_transform(X, y) assert stl.period is not None +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_doesnt_modify_target_index( generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=7, + set_time_index=False, + ) + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=7, + set_time_index=False, + ) + original_X_index = X.index original_y_index = y.index diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 1882f7c05e..c534ddf636 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2516,3 +2516,98 @@ def _split_nullable_logical_types_by_compatibility( return compatible_ltypes, incompatible_ltypes return _split_nullable_logical_types_by_compatibility + + +@pytest.fixture +def generate_multiseries_seasonal_data(): + """Function that returns data with a linear trend 
and a seasonal signal with specified period for multiseries.""" + + def generate_real_data( + period, + step=None, + num_periods=20, + scale=1, + seasonal_scale=1, + trend_degree=1, + freq_str="D", + set_time_index=False, + ): + X, y = load_weather() + y = y.set_axis(X["Date"]).asfreq(pd.infer_freq(X["Date"])) + y_ms = pd.DataFrame({f"target_{i}": y - i for i in range(2)}) + X = X.set_index("Date").asfreq(pd.infer_freq(X["Date"])) + return X, y_ms + + def generate_synthetic_data( + period, + step=None, + num_periods=20, + scale=1, + seasonal_scale=1, + trend_degree=1, + freq_str="D", + set_time_index=False, + ): + """Function to generate a sinusoidal signal with a polynomial trend. + + Args: + period: The length, in units, of the seasonal signal. + step: + num_periods: How many periods of the seasonal signal to generate. + scale: The relative scale of the trend. Setting it higher increases + the comparative strength of the trend. + seasonal_scale: The relative scale of the sinusoidal seasonality. + Setting it higher increases the comparative strength of the + trend. + trend_degree: The degree of the polynomial trend. 1 = linear, 2 = + quadratic, 3 = cubic. Specific functional forms defined + below. + freq_str: The pandas frequency string used to define the unit of + time in the series time index. + set_time_index: Whether to set the time index with a pandas. + DatetimeIndex. + + Returns: + X (pandas.DateFrame): A placeholder feature matrix. + y (pandas.Series): A synthetic, time series target Series. 
+ + """ + if period is None: + x = np.arange(0, 1, 0.01) + elif step is not None: + freq = 2 * np.pi / period / step + x = np.arange(0, 1, step) + else: + freq = 2 * np.pi / period + x = np.arange(0, period * num_periods, 1) + dts = pd.date_range(datetime.today(), periods=len(x), freq=freq_str) + X = pd.DataFrame({"x": x}) + X = X.set_index(dts) + + y_ms_list = [] + for i in range(2): + for j in range(5): + if trend_degree == 1: + y_trend = pd.Series(scale * minmax_scale(x + 2)) + elif trend_degree == 2: + y_trend = pd.Series(scale * minmax_scale(x**2)) + elif trend_degree == 3: + y_trend = pd.Series(scale * minmax_scale((x - 5) ** 3 + x**2)) + if period is not None: + y_seasonal = pd.Series(seasonal_scale * np.sin(freq * x)) + else: + y_seasonal = pd.Series(np.zeros(len(x))) + y = y_trend + y_seasonal - i + if set_time_index: + y = y.set_axis(dts) + y_ms_list.append(y) + y_ms = pd.DataFrame(y_ms_list).T + return X, y_ms + + def _return_proper_func(real_or_synthetic): + if real_or_synthetic == "synthetic": + return generate_synthetic_data + elif real_or_synthetic == "real": + return generate_real_data + + return _return_proper_func From d3c94687f57428e0a1fcce80227113dcb54abaac Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 11 Aug 2023 11:15:46 -0400 Subject: [PATCH 25/47] subset test remaining --- .../decomposer_tests/test_stl_decomposer.py | 57 ++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index f836257ed9..5eb50e1050 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -134,19 +134,19 @@ def test_stl_fit_transform_in_sample( trend_degree=trend_degree, ) - # Get the expected answer - lin_reg = LinearRegression(fit_intercept=True) - features = 
PolynomialFeatures(degree=trend_degree).fit_transform( - np.arange(X.shape[0]).reshape(-1, 1), - ) - lin_reg.fit(features, y) - expected_trend = lin_reg.predict(features) - stl = STLDecomposer(period=period) X_t, y_t = stl.fit_transform(X, y) if variateness == "univariate": + # Get the expected answer + lin_reg = LinearRegression(fit_intercept=True) + features = PolynomialFeatures(degree=trend_degree).fit_transform( + np.arange(X.shape[0]).reshape(-1, 1), + ) + lin_reg.fit(features, y) + expected_trend = lin_reg.predict(features) + # Check to make sure STL detrended/deseasoned pd.testing.assert_series_equal( pd.Series(np.zeros(len(y_t))), @@ -156,29 +156,34 @@ def test_stl_fit_transform_in_sample( check_names=False, atol=0.1, ) - elif variateness == "multivariate": + # Check the trend to make sure STL worked properly pd.testing.assert_series_equal( - pd.DataFrame( - np.zeros((len(y_t), len(y_t.columns))), - columns=y_t.columns, - index=y_t.index, - ), - y_t, + pd.Series(expected_trend), + pd.Series(stl.trend), check_exact=False, check_index=False, check_names=False, - atol=0.1, + atol=0.3, ) - - # Check the trend to make sure STL worked properly - pd.testing.assert_series_equal( - pd.Series(expected_trend), - pd.Series(stl.trend), - check_exact=False, - check_index=False, - check_names=False, - atol=0.3, - ) + elif variateness == "multivariate": + # Get the expected answer + for id in y.columns: + y_series = y[id] + lin_reg = LinearRegression(fit_intercept=True) + features = PolynomialFeatures(degree=trend_degree).fit_transform( + np.arange(X.shape[0]).reshape(-1, 1), + ) + lin_reg.fit(features, y_series) + expected_trend = lin_reg.predict(features) + # Check the trend to make sure STL worked properly + pd.testing.assert_series_equal( + pd.Series(expected_trend), + pd.Series(stl.decompositions[id]["trend"]), + check_exact=False, + check_index=False, + check_names=False, + atol=0.3, + ) # Verify the X is not changed pd.testing.assert_frame_equal(X, X_t) From 
58cd094a3ae84c784d55204c9e672ae17541c652 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Sun, 13 Aug 2023 20:26:03 -0700 Subject: [PATCH 26/47] add multiseries tests --- .../preprocessing/stl_decomposer.py | 34 ++-- .../decomposer_tests/test_decomposer.py | 130 ++++++++++----- .../decomposer_tests/test_stl_decomposer.py | 148 ++++++++++++------ evalml/tests/conftest.py | 114 +++++++------- 4 files changed, 267 insertions(+), 159 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index d9d9d41c9e..2f1199104e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -389,29 +389,16 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra ), ) y_series = pd.concat([y_in_sample, y_out_of_sample]) + # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] + if len(y_t.columns) <= 1: + y_series.index = original_index + return y_series - y.append(y_series) + y.append(y_series) y_df = pd.DataFrame(y).T y_df.index = original_index return y_df - def fit_transform( - self, - X: pd.DataFrame, - y: pd.DataFrame = None, - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Removes fitted trend and seasonality from target variable. - - Args: - X (pd.DataFrame, optional): Ignored. - y (pd.Series): Target variable to detrend and deseasonalize. - - Returns: - tuple of pd.DataFrame, pd.Series: The first element are the input features returned without modification. - The second element is the target variable y with the fitted trend removed. - """ - return self.fit(X, y).transform(X, y) - def get_trend_dataframe(self, X, y): """Return a list of dataframes with 4 columns: signal, trend, seasonality, residual. 
@@ -495,6 +482,10 @@ def _decompose_target(X, y, fh): for colname in series_y.columns: result_dfs.append(_decompose_target(X, series_y[colname], None)) series_results[id] = result_dfs + + # only return the dictionary if single series + if len(y.columns) <= 1: + return result_dfs return series_results def get_trend_prediction_intervals(self, y, coverage=None): @@ -511,6 +502,7 @@ def get_trend_prediction_intervals(self, y, coverage=None): if isinstance(y, pd.Series): y = y.to_frame() + series_results = {} for id in y.columns: y_series = y[id] @@ -542,8 +534,12 @@ def get_trend_prediction_intervals(self, y, coverage=None): intervals.index = y_series.index prediction_interval_result[f"{coverage[i]}_lower"] = intervals["lower"] prediction_interval_result[f"{coverage[i]}_upper"] = intervals["upper"] + series_results[id] = prediction_interval_result - return prediction_interval_result + # only return the dictionary if single series + if len(y.columns) <= 1: + return prediction_interval_result + return series_results # Overload the plot_decomposition fucntion to be able to plot multiple decompositions for multiseries def plot_decomposition( diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index 50e6278d92..d8bd48ce9d 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -802,24 +802,23 @@ def test_decomposer_fit_transform_out_of_sample( set_time_index=True, seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend ) - subset_y = [] - for id in y.columns: - subset_y.append(y[id][2 * period : 7 * period]) - subset_y = pd.DataFrame(subset_y) + subset_y = y.loc[y.index[2 * period : 7 * period]] + subset_X = X[2 * period : 7 * period] decomposer = decomposer_child_class(period=period) decomposer.fit(subset_X, subset_y) if transformer_fit_on_data == 
"in-sample": - output_X, output_y = decomposer.transform(subset_X, subset_y) - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index), - output_y, - check_dtype=False, - check_names=False, - atol=0.2, - ) + if variateness == "univariate": + output_X, output_y = decomposer.transform(subset_X, subset_y) + pd.testing.assert_series_equal( + pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index), + output_y, + check_dtype=False, + check_names=False, + atol=0.2, + ) if transformer_fit_on_data != "in-sample": y_new = build_test_target( @@ -838,14 +837,23 @@ def test_decomposer_fit_transform_out_of_sample( ): output_X, output_inverse_y = decomposer.transform(None, y_new) else: - output_X, output_y_t = decomposer.transform(None, y[y_new.index]) - - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), - output_y_t, - check_exact=False, - atol=0.1, # STLDecomposer is within atol=5.0e-4 - ) + if variateness == "univariate": + output_X, output_y_t = decomposer.transform(None, y[y_new.index]) + pd.testing.assert_series_equal( + pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), + output_y_t, + check_exact=False, + atol=0.1, # STLDecomposer is within atol=5.0e-4 + ) + elif variateness == "mulivariate": + y_new = pd.DataFrame([y_new, y_new]).T + output_X, output_y_t = decomposer.transform(None, y[y_new.index]) + pd.testing.assert_frame_equal( + pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), + output_y_t, + check_exact=False, + atol=0.1, # STLDecomposer is within atol=5.0e-4 + ) @pytest.mark.parametrize( @@ -865,31 +873,67 @@ def test_decomposer_fit_transform_out_of_sample( "partially-out-of-sample-in-past", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_inverse_transform( decomposer_child_class, index_type, generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, transformer_fit_on_data, 
): # Generate 10 periods (the default) of synthetic seasonal data period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) + subset_y = y[: 5 * period] + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) + subset_y = y.loc[y.index[: 5 * period]] + subset_X = X[: 5 * period] - subset_y = y[: 5 * period] decomposer = decomposer_child_class(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - pd.testing.assert_series_equal(subset_y, output_inverse_y, check_dtype=False) + if isinstance(decomposer, STLDecomposer): + pd.testing.assert_frame_equal( + pd.DataFrame(subset_y), + output_inverse_y, + check_dtype=False, + ) + elif isinstance(decomposer, PolynomialDecomposer): + pd.testing.assert_series_equal( + pd.Series(subset_y), + output_inverse_y, + check_dtype=False, + ) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -898,6 +942,8 @@ def test_decomposer_inverse_transform( transformer_fit_on_data, 
to_test="inverse_transform", ) + if variateness == "multivariate": + y_t_new = pd.DataFrame([y_t_new, y_t_new]).T if transformer_fit_on_data in [ "out-of-sample-in-past", "partially-out-of-sample-in-past", @@ -911,15 +957,23 @@ def test_decomposer_inverse_transform( output_inverse_y = decomposer.inverse_transform(y_t_new) # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. - pd.testing.assert_series_equal( - y[y_t_new.index], - output_inverse_y, - check_exact=False, - check_index=False, - rtol=1.0e-1, - ) + if isinstance(decomposer, STLDecomposer): + pd.testing.assert_frame_equal( + pd.DataFrame(y.loc[y_t_new.index]), + output_inverse_y, + check_exact=False, + rtol=1.0e-1, + ) + elif isinstance(decomposer, PolynomialDecomposer): + pd.testing.assert_series_equal( + pd.Series(y[y_t_new.index]), + output_inverse_y, + check_exact=False, + check_index=False, + rtol=1.0e-1, + ) pd.testing.assert_index_equal( - y[y_t_new.index].index, + y.loc[y_t_new.index].index, output_inverse_y.index, exact=False, ) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 5eb50e1050..bb9ffdae86 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -202,29 +202,53 @@ def test_stl_fit_transform_in_sample( "partially-out-of-sample-in-past", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_inverse_transform( index_type, generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, transformer_fit_on_data, ): # Generate 10 periods (the default) of synthetic seasonal data period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - if 
index_type == "integer_index": - y = y.reset_index(drop=True) - subset_X = X[: 5 * period] - subset_y = y[: 5 * period] + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) + subset_X = X[: 5 * period] + subset_y = y[: 5 * period] + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) + subset_y = y.loc[y.index[: 5 * period]] + subset_X = X[: 5 * period] decomposer = STLDecomposer(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - pd.testing.assert_series_equal(subset_y, output_inverse_y, check_dtype=False) + pd.testing.assert_frame_equal( + pd.DataFrame(subset_y), + output_inverse_y, + check_dtype=False, + ) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -233,6 +257,8 @@ def test_stl_decomposer_inverse_transform( transformer_fit_on_data, to_test="inverse_transform", ) + if variateness == "multivariate": + y_t_new = pd.DataFrame([y_t_new, y_t_new]).T if transformer_fit_on_data in [ "out-of-sample-in-past", "partially-out-of-sample-in-past", @@ -246,14 +272,14 @@ def test_stl_decomposer_inverse_transform( # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. 
output_inverse_y = decomposer.inverse_transform(y_t_new) - pd.testing.assert_series_equal( - y[y_t_new.index], + pd.testing.assert_frame_equal( + pd.DataFrame(y.loc[y_t_new.index]), output_inverse_y, - check_index=False, - rtol=1.0e-2, + check_exact=False, + rtol=1.0e-1, ) pd.testing.assert_index_equal( - y[y_t_new.index].index, + y.loc[y_t_new.index].index, output_inverse_y.index, exact=False, ) @@ -287,36 +313,46 @@ def test_stl_decomposer_get_trend_dataframe( variateness, ): period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) + + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + ) + subset_y = y[: 5 * period] + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + ) + subset_y = y.loc[y.index[: 5 * period]] + subset_X = X[: 5 * period] - subset_y = y[: 5 * period] if transformer_fit_on_data == "in-sample": dec = STLDecomposer() dec.fit(subset_X, subset_y) # get_trend_dataframe() is only expected to work with datetime indices - if variateness == "multivariate": - subset_y = pd.concat([subset_y, subset_y], axis=1) result_dfs = dec.get_trend_dataframe(subset_X, subset_y) assert isinstance(result_dfs, dict) assert all(isinstance(result_dfs[x], list) for x in result_dfs) assert all( - all(isinstance(y, pd.DataFrame) for y in result_dfs[x]) for x in result_dfs + all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) + for df in result_dfs ) if variateness == "univariate": - assert len(result_dfs[0]) == 1 + assert len(result_dfs) == 1 [get_trend_dataframe_format_correct(x) for x in result_dfs[0]] + elif variateness == "multivariate": - assert len(result_dfs[0]) == 2 + assert len(result_dfs) == 2 [ - get_trend_dataframe_format_correct(x) - for idx, x in 
enumerate(result_dfs[0]) + (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) + for df in result_dfs ] elif transformer_fit_on_data != "in-sample": @@ -326,12 +362,12 @@ def test_stl_decomposer_get_trend_dataframe( transformer_fit_on_data, to_test="transform", ) + if variateness == "multivariate": + y_t_new = pd.DataFrame([y_t_new, y_t_new]).T dec = STLDecomposer() dec.fit(subset_X, subset_y) # get_trend_dataframe() is only expected to work with datetime indices - if variateness == "multivariate": - y_t_new = pd.concat([y_t_new, y_t_new], axis=1) if transformer_fit_on_data in [ "out-of-sample-in-past", @@ -349,18 +385,18 @@ def test_stl_decomposer_get_trend_dataframe( assert isinstance(result_dfs, dict) assert all(isinstance(result_dfs[x], list) for x in result_dfs) assert all( - all(isinstance(y, pd.DataFrame) for y in result_dfs[x]) - for x in result_dfs + all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) + for df in result_dfs ) assert all(get_trend_dataframe_format_correct(x) for x in result_dfs[0]) if variateness == "univariate": - assert len(result_dfs[0]) == 1 + assert len(result_dfs) == 1 [get_trend_dataframe_format_correct(x) for x in result_dfs[0]] elif variateness == "multivariate": - assert len(result_dfs[0]) == 2 + assert len(result_dfs) == 2 [ - get_trend_dataframe_format_correct(x) - for idx, x in enumerate(result_dfs[0]) + (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) + for df in result_dfs ] @@ -476,22 +512,39 @@ def test_stl_decomposer_doesnt_modify_target_index( @pytest.mark.parametrize("index_type", ["datetime", "int"]) @pytest.mark.parametrize("set_coverage", [True, False]) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_get_trend_prediction_intervals( set_coverage, index_type, generate_seasonal_data, + generate_multiseries_seasonal_data, + variateness, ): coverage = [0.75, 0.85, 0.95] if set_coverage else None period = 7 - X, y = 
generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) + if variateness == "univariate": + X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + ) + y_train = y[: 15 * period] + y_validate = y[15 * period :] + elif variateness == "multivariate": + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period=period, + freq_str="D", + set_time_index=True, + ) + y_train = y.loc[y.index[: 15 * period]] + y_validate = y.loc[y.index[15 * period :]] X_train = X[: 15 * period] - y_train = y[: 15 * period] - - y_validate = y[15 * period :] stl = STLDecomposer() stl.fit(X_train, y_train) @@ -513,4 +566,9 @@ def assert_pred_interval_coverage(pred_interval): y_validate, coverage=coverage, ) - assert_pred_interval_coverage(trend_pred_intervals) + + if isinstance(y_validate, pd.Series): + y_validate = pd.DataFrame(y_validate) + + for id in y_validate: + assert_pred_interval_coverage(trend_pred_intervals[id]) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index c534ddf636..a801e99ced 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2462,62 +2462,6 @@ def _return_proper_func(real_or_synthetic): return _return_proper_func -@pytest.fixture -def categorical_floats_df(): - X = pd.DataFrame( - { - "double_int_cats": pd.Series([1.0, 2.0, 3.0, 4.0, 5.0] * 20), - "string_cats": pd.Series(["a", "b", "c", "d", "e"] * 20), - "int_cats": pd.Series([1, 2, 3, 4, 5] * 20), - "int_col": pd.Series([1, 2, 3, 4, 5] * 20), - "double_col": pd.Series([1.2, 2.3, 3.9, 4.1, 5.5] * 20), - }, - ) - X.ww.init( - logical_types={ - "double_int_cats": "Categorical", - "string_cats": "Categorical", - "int_cats": "Categorical", - "int_col": "Integer", - "double_col": "Double", - }, - ) - - return X - - -@pytest.fixture -def get_black_config(): - current_dir = os.path.dirname(os.path.abspath(__file__)) - evalml_path = 
os.path.abspath(os.path.join(current_dir, "..", "..")) - black_config = get_evalml_black_config(evalml_path) - return black_config - - -@pytest.fixture -def split_nullable_logical_types_by_compatibility(): - def _split_nullable_logical_types_by_compatibility( - int_null_incompatible, - bool_null_incompatible, - ): - incompatible_ltypes = [] - compatible_ltypes = [] - if int_null_incompatible: - incompatible_ltypes.append(IntegerNullable) - incompatible_ltypes.append(AgeNullable) - else: - compatible_ltypes.append(IntegerNullable) - compatible_ltypes.append(AgeNullable) - if bool_null_incompatible: - incompatible_ltypes.append(BooleanNullable) - else: - compatible_ltypes.append(BooleanNullable) - - return compatible_ltypes, incompatible_ltypes - - return _split_nullable_logical_types_by_compatibility - - @pytest.fixture def generate_multiseries_seasonal_data(): """Function that returns data with a linear trend and a seasonal signal with specified period for multiseries.""" @@ -2597,7 +2541,7 @@ def generate_synthetic_data( y_seasonal = pd.Series(seasonal_scale * np.sin(freq * x)) else: y_seasonal = pd.Series(np.zeros(len(x))) - y = y_trend + y_seasonal - i + y = y_trend + y_seasonal if set_time_index: y = y.set_axis(dts) y_ms_list.append(y) @@ -2611,3 +2555,59 @@ def _return_proper_func(real_or_synthetic): return generate_real_data return _return_proper_func + + +@pytest.fixture +def categorical_floats_df(): + X = pd.DataFrame( + { + "double_int_cats": pd.Series([1.0, 2.0, 3.0, 4.0, 5.0] * 20), + "string_cats": pd.Series(["a", "b", "c", "d", "e"] * 20), + "int_cats": pd.Series([1, 2, 3, 4, 5] * 20), + "int_col": pd.Series([1, 2, 3, 4, 5] * 20), + "double_col": pd.Series([1.2, 2.3, 3.9, 4.1, 5.5] * 20), + }, + ) + X.ww.init( + logical_types={ + "double_int_cats": "Categorical", + "string_cats": "Categorical", + "int_cats": "Categorical", + "int_col": "Integer", + "double_col": "Double", + }, + ) + + return X + + +@pytest.fixture +def get_black_config(): + current_dir 
= os.path.dirname(os.path.abspath(__file__)) + evalml_path = os.path.abspath(os.path.join(current_dir, "..", "..")) + black_config = get_evalml_black_config(evalml_path) + return black_config + + +@pytest.fixture +def split_nullable_logical_types_by_compatibility(): + def _split_nullable_logical_types_by_compatibility( + int_null_incompatible, + bool_null_incompatible, + ): + incompatible_ltypes = [] + compatible_ltypes = [] + if int_null_incompatible: + incompatible_ltypes.append(IntegerNullable) + incompatible_ltypes.append(AgeNullable) + else: + compatible_ltypes.append(IntegerNullable) + compatible_ltypes.append(AgeNullable) + if bool_null_incompatible: + incompatible_ltypes.append(BooleanNullable) + else: + compatible_ltypes.append(BooleanNullable) + + return compatible_ltypes, incompatible_ltypes + + return _split_nullable_logical_types_by_compatibility From b68fda8f5f965f5cd182854ea6a3ada055c5d0f9 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 14 Aug 2023 09:22:28 -0700 Subject: [PATCH 27/47] fix univariate tests --- .../preprocessing/stl_decomposer.py | 11 +- .../decomposer_tests/test_decomposer.py | 16 ++- .../decomposer_tests/test_stl_decomposer.py | 104 ++++++++++++------ 3 files changed, 87 insertions(+), 44 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 2f1199104e..ee4979d736 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -579,13 +579,16 @@ def plot_decomposition( for id in y.columns: fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) - axs[0].plot(decomposition_results[id][0]["signal"], "r") + + if len(y.columns) > 1: + decomposition_results = decomposition_results[id] + axs[0].plot(decomposition_results[0]["signal"], "r") axs[0].set_title("signal") - 
axs[1].plot(decomposition_results[id][0]["trend"], "b") + axs[1].plot(decomposition_results[0]["trend"], "b") axs[1].set_title("trend") - axs[2].plot(decomposition_results[id][0]["seasonality"], "g") + axs[2].plot(decomposition_results[0]["seasonality"], "g") axs[2].set_title("seasonality") - axs[3].plot(decomposition_results[id][0]["residual"], "y") + axs[3].plot(decomposition_results[0]["residual"], "y") axs[3].set_title("residual") if len(y.columns) > 1: diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index d8bd48ce9d..50e76f9ce6 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -922,13 +922,17 @@ def test_decomposer_inverse_transform( if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - if isinstance(decomposer, STLDecomposer): + if isinstance(decomposer, STLDecomposer) and variateness == "multivariate": pd.testing.assert_frame_equal( pd.DataFrame(subset_y), output_inverse_y, check_dtype=False, ) - elif isinstance(decomposer, PolynomialDecomposer): + elif ( + isinstance(decomposer, PolynomialDecomposer) + or isinstance(decomposer, STLDecomposer) + and variateness == "univariate" + ): pd.testing.assert_series_equal( pd.Series(subset_y), output_inverse_y, @@ -957,14 +961,18 @@ def test_decomposer_inverse_transform( output_inverse_y = decomposer.inverse_transform(y_t_new) # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. 
- if isinstance(decomposer, STLDecomposer): + if isinstance(decomposer, STLDecomposer) and variateness == "multivariate": pd.testing.assert_frame_equal( pd.DataFrame(y.loc[y_t_new.index]), output_inverse_y, check_exact=False, rtol=1.0e-1, ) - elif isinstance(decomposer, PolynomialDecomposer): + elif ( + isinstance(decomposer, PolynomialDecomposer) + or isinstance(decomposer, STLDecomposer) + and variateness == "univariate" + ): pd.testing.assert_series_equal( pd.Series(y[y_t_new.index]), output_inverse_y, diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index bb9ffdae86..e25f571232 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -244,11 +244,18 @@ def test_stl_decomposer_inverse_transform( if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - pd.testing.assert_frame_equal( - pd.DataFrame(subset_y), - output_inverse_y, - check_dtype=False, - ) + if variateness == "univariate": + pd.testing.assert_series_equal( + subset_y, + output_inverse_y, + check_dtype=False, + ) + elif variateness == "mulitvariate": + pd.testing.assert_frame_equal( + pd.DataFrame(subset_y), + output_inverse_y, + check_dtype=False, + ) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -272,12 +279,21 @@ def test_stl_decomposer_inverse_transform( # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. 
output_inverse_y = decomposer.inverse_transform(y_t_new) - pd.testing.assert_frame_equal( - pd.DataFrame(y.loc[y_t_new.index]), - output_inverse_y, - check_exact=False, - rtol=1.0e-1, - ) + + if variateness == "univariate": + pd.testing.assert_series_equal( + y[y_t_new.index], + output_inverse_y, + check_index=False, + rtol=1.0e-2, + ) + elif variateness == "mulitvariate": + pd.testing.assert_frame_equal( + pd.DataFrame(y.loc[y_t_new.index]), + output_inverse_y, + check_exact=False, + rtol=1.0e-1, + ) pd.testing.assert_index_equal( y.loc[y_t_new.index].index, output_inverse_y.index, @@ -338,17 +354,21 @@ def test_stl_decomposer_get_trend_dataframe( # get_trend_dataframe() is only expected to work with datetime indices result_dfs = dec.get_trend_dataframe(subset_X, subset_y) - assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) - assert all( - all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) - for df in result_dfs - ) + if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) assert len(result_dfs) == 1 - [get_trend_dataframe_format_correct(x) for x in result_dfs[0]] + [get_trend_dataframe_format_correct(x) for x in result_dfs] elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) + for df in result_dfs + ) assert len(result_dfs) == 2 [ (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) @@ -382,17 +402,23 @@ def test_stl_decomposer_get_trend_dataframe( else: result_dfs = dec.get_trend_dataframe(X.loc[y_t_new.index], y_t_new) - assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) - assert all( - all(isinstance(x, pd.DataFrame) for x in 
result_dfs[df]) - for df in result_dfs - ) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs[0]) if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) assert len(result_dfs) == 1 - [get_trend_dataframe_format_correct(x) for x in result_dfs[0]] + [get_trend_dataframe_format_correct(x) for x in result_dfs] elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) + for df in result_dfs + ) + assert all( + (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) + for df in result_dfs + ) assert len(result_dfs) == 2 [ (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) @@ -429,11 +455,17 @@ def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( stl.fit(X, y) result_dfs = stl.get_trend_dataframe(X, y) - assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) - assert all( - all(isinstance(y, pd.DataFrame) for y in result_dfs[x]) for x in result_dfs - ) + if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + elif variateness == "mulitvariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) + for df in result_dfs + ) @pytest.mark.parametrize( @@ -567,8 +599,8 @@ def assert_pred_interval_coverage(pred_interval): coverage=coverage, ) - if isinstance(y_validate, pd.Series): - y_validate = pd.DataFrame(y_validate) - - for id in y_validate: - assert_pred_interval_coverage(trend_pred_intervals[id]) + 
if variateness == "univariate": + assert_pred_interval_coverage(trend_pred_intervals) + elif variateness == "multivariate": + for id in y_validate: + assert_pred_interval_coverage(trend_pred_intervals[id]) From 95d261913aeb63ca296d6ed1805a9fcd6dfa01b4 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 14 Aug 2023 12:35:34 -0700 Subject: [PATCH 28/47] fix codecov --- .../preprocessing/stl_decomposer.py | 30 ++++++------- .../decomposer_tests/test_decomposer.py | 43 +++++++++++-------- .../decomposer_tests/test_stl_decomposer.py | 6 +-- evalml/tests/conftest.py | 28 +----------- 4 files changed, 46 insertions(+), 61 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index ee4979d736..2f6a92eef6 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -262,8 +262,6 @@ def transform( self.residual = self.decompositions[id]["residual"] self.period = self.decompositions[id]["period"] - if series_y is None: - return X, series_y original_index = series_y.index X, series_y = self._check_target(X, series_y) self._check_oos_past(series_y) @@ -478,9 +476,7 @@ def _decompose_target(X, y, fh): series_y = y[id] if isinstance(series_y, pd.Series): result_dfs.append(_decompose_target(X, series_y, None)) - elif isinstance(series_y, pd.DataFrame): - for colname in series_y.columns: - result_dfs.append(_decompose_target(X, series_y[colname], None)) + series_results[id] = result_dfs # only return the dictionary if single series @@ -567,12 +563,10 @@ def plot_decomposition( if isinstance(y, pd.Series): y = y.to_frame() - if isinstance(y, pd.Series): - y = y.to_frame() - plot_info = {} - if self.frequency and len(y.columns) > 1: - X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) + if self.frequency and self.time_index and 
len(y.columns) > 1: + if isinstance(X.index, int): + X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) decomposition_results = self.get_trend_dataframe(X, y) # Iterate through each series id @@ -580,17 +574,23 @@ def plot_decomposition( fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) + for ax in axs: + ax.cla() + if len(y.columns) > 1: - decomposition_results = decomposition_results[id] - axs[0].plot(decomposition_results[0]["signal"], "r") + results = decomposition_results[id] + else: + results = decomposition_results + axs[0].plot(results[0]["signal"], "r") axs[0].set_title("signal") - axs[1].plot(decomposition_results[0]["trend"], "b") + axs[1].plot(results[0]["trend"], "b") axs[1].set_title("trend") - axs[2].plot(decomposition_results[0]["seasonality"], "g") + axs[2].plot(results[0]["seasonality"], "g") axs[2].set_title("seasonality") - axs[3].plot(decomposition_results[0]["residual"], "y") + axs[3].plot(results[0]["residual"], "y") axs[3].set_title("residual") + # If multiseries, return a dictionary of tuples if len(y.columns) > 1: fig.suptitle("Decomposition for Series {}".format(id)) plot_info[id] = (fig, axs) diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index 50e76f9ce6..9e5435c0ec 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -72,33 +72,40 @@ def test_decomposer_plot_decomposition( decomposer_child_class, y_has_time_index, generate_seasonal_data, + generate_multiseries_seasonal_data, variateness, - multiseries_ts_data_unstacked, ): + step = 0.01 + period = 9 if variateness == "univariate": - x = np.arange(0, 2 * np.pi, 0.01) - dts = pd.date_range(datetime.today(), periods=len(x)) - X = pd.DataFrame({"x": x}) - X = X.set_index(dts) - y = pd.Series(np.sin(x)) + X, y = 
generate_seasonal_data(real_or_synthetic="synthetic")(period, step) elif variateness == "multivariate": if isinstance(decomposer_child_class(), PolynomialDecomposer): pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - step = 0.01 - period = 9 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")(period, step) + X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( + period, + step, + ) if y_has_time_index == "y_has_time_index": y = y.set_axis(X.index) dec = decomposer_child_class(degree=1, period=period) dec.fit_transform(X, y) - fig, axs = dec.plot_decomposition(X, y, show=False) - assert isinstance(fig, matplotlib.pyplot.Figure) - assert isinstance(axs, np.ndarray) - assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + + if variateness == "univariate": + fig, axs = dec.plot_decomposition(X, y, show=False) + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + elif variateness == "multivariate": + result_plots = dec.plot_decomposition(X, y, show=False) + for id in y.columns: + fig, axs = result_plots[id] + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) @pytest.mark.parametrize( @@ -845,11 +852,13 @@ def test_decomposer_fit_transform_out_of_sample( check_exact=False, atol=0.1, # STLDecomposer is within atol=5.0e-4 ) - elif variateness == "mulivariate": + elif variateness == "multivariate": y_new = pd.DataFrame([y_new, y_new]).T - output_X, output_y_t = decomposer.transform(None, y[y_new.index]) + output_X, output_y_t = decomposer.transform(None, y.loc[y_new.index]) pd.testing.assert_frame_equal( - pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), + pd.DataFrame( + [np.zeros(len(output_y_t)), 
np.zeros(len(output_y_t))], + ).T.set_axis(y_new.index), output_y_t, check_exact=False, atol=0.1, # STLDecomposer is within atol=5.0e-4 diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index e25f571232..7c71b5c8f2 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -250,7 +250,7 @@ def test_stl_decomposer_inverse_transform( output_inverse_y, check_dtype=False, ) - elif variateness == "mulitvariate": + elif variateness == "multivariate": pd.testing.assert_frame_equal( pd.DataFrame(subset_y), output_inverse_y, @@ -287,7 +287,7 @@ def test_stl_decomposer_inverse_transform( check_index=False, rtol=1.0e-2, ) - elif variateness == "mulitvariate": + elif variateness == "multivariate": pd.testing.assert_frame_equal( pd.DataFrame(y.loc[y_t_new.index]), output_inverse_y, @@ -459,7 +459,7 @@ def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( assert isinstance(result_dfs, list) assert all(isinstance(x, pd.DataFrame) for x in result_dfs) assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) - elif variateness == "mulitvariate": + elif variateness == "multivariate": assert isinstance(result_dfs, dict) assert all(isinstance(result_dfs[x], list) for x in result_dfs) assert all( diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 1fedba1a82..6ee1c9d631 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2533,22 +2533,6 @@ def _return_proper_func(real_or_synthetic): def generate_multiseries_seasonal_data(): """Function that returns data with a linear trend and a seasonal signal with specified period for multiseries.""" - def generate_real_data( - period, - step=None, - num_periods=20, - scale=1, - seasonal_scale=1, - trend_degree=1, - freq_str="D", - set_time_index=False, - ): - X, y = 
load_weather() - y = y.set_axis(X["Date"]).asfreq(pd.infer_freq(X["Date"])) - y_ms = pd.DataFrame({f"target_{i}": y - i for i in range(2)}) - X = X.set_index("Date").asfreq(pd.infer_freq(X["Date"])) - return X, y_ms - def generate_synthetic_data( period, step=None, @@ -2583,14 +2567,8 @@ def generate_synthetic_data( y (pandas.Series): A synthetic, time series target Series. """ - if period is None: - x = np.arange(0, 1, 0.01) - elif step is not None: - freq = 2 * np.pi / period / step - x = np.arange(0, 1, step) - else: - freq = 2 * np.pi / period - x = np.arange(0, period * num_periods, 1) + freq = 2 * np.pi / period + x = np.arange(0, period * num_periods, 1) dts = pd.date_range(datetime.today(), periods=len(x), freq=freq_str) X = pd.DataFrame({"x": x}) X = X.set_index(dts) @@ -2618,8 +2596,6 @@ def generate_synthetic_data( def _return_proper_func(real_or_synthetic): if real_or_synthetic == "synthetic": return generate_synthetic_data - elif real_or_synthetic == "real": - return generate_real_data return _return_proper_func From 55b5f1d10dfcd1469e90449ac443b6c9c7073bca Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 14 Aug 2023 13:53:56 -0700 Subject: [PATCH 29/47] add plot test --- .../preprocessing/stl_decomposer.py | 3 +- .../decomposer_tests/test_stl_decomposer.py | 37 +++++++++++++++++++ evalml/tests/conftest.py | 2 - 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 2f6a92eef6..7e99f22d7f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -565,8 +565,7 @@ def plot_decomposition( plot_info = {} if self.frequency and self.time_index and len(y.columns) > 1: - if isinstance(X.index, int): - X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) + X.index = 
pd.DatetimeIndex(X[self.time_index], freq=self.frequency) decomposition_results = self.get_trend_dataframe(X, y) # Iterate through each series id diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 7c71b5c8f2..5b20c92ebb 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -1,3 +1,4 @@ +import matplotlib import numpy as np import pandas as pd import pytest @@ -604,3 +605,39 @@ def assert_pred_interval_coverage(pred_interval): elif variateness == "multivariate": for id in y_validate: assert_pred_interval_coverage(trend_pred_intervals[id]) + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_decomposer_plot_decomposition( + ts_data, + multiseries_ts_data_unstacked, + variateness, +): + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + X, y = multiseries_ts_data_unstacked + X.index = X["date"] + X.index.freq = "D" + + dec = STLDecomposer(time_index="date") + dec.fit_transform(X, y) + + if variateness == "univariate": + fig, axs = dec.plot_decomposition(X, y, show=False) + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + elif variateness == "multivariate": + result_plots = dec.plot_decomposition(X, y, show=False) + for id in y.columns: + fig, axs = result_plots[id] + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 6ee1c9d631..e114d54f75 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2584,8 +2584,6 @@ def generate_synthetic_data( y_trend = pd.Series(scale * 
minmax_scale((x - 5) ** 3 + x**2)) if period is not None: y_seasonal = pd.Series(seasonal_scale * np.sin(freq * x)) - else: - y_seasonal = pd.Series(np.zeros(len(x))) y = y_trend + y_seasonal if set_time_index: y = y.set_axis(dts) From 5a131a373fc3aca9c427a836a18474e078fb9b28 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 14 Aug 2023 14:36:41 -0700 Subject: [PATCH 30/47] check STL is detrended/deseasoned --- .../decomposer_tests/test_stl_decomposer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 5b20c92ebb..5227726594 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -169,6 +169,16 @@ def test_stl_fit_transform_in_sample( elif variateness == "multivariate": # Get the expected answer for id in y.columns: + # Check to make sure STL detrended/deseasoned + y_t_series = y_t[id] + pd.testing.assert_series_equal( + pd.Series(np.zeros(len(y_t_series))), + y_t_series, + check_exact=False, + check_index=False, + check_names=False, + atol=0.1, + ) y_series = y[id] lin_reg = LinearRegression(fit_intercept=True) features = PolynomialFeatures(degree=trend_degree).fit_transform( From 588ec95500b93a6de24aaa778537a578601e1d07 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 16 Aug 2023 12:42:24 -0700 Subject: [PATCH 31/47] get rid of class vars --- docs/source/release_notes.rst | 2 +- .../preprocessing/stl_decomposer.py | 207 ++++++++++-------- .../decomposer_tests/test_decomposer.py | 5 +- .../decomposer_tests/test_stl_decomposer.py | 4 +- 4 files changed, 127 insertions(+), 91 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index e2ed790f5c..3c69f6f298 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release 
Notes ------------- **Future Releases** * Enhancements + * Extended STLDecomposer to Support Multiseries :pr:`4253` * Fixes * Changes * Documentation Changes @@ -20,7 +21,6 @@ Release Notes * Added multiseries regression pipeline class :pr:`4256` * Added multiseries VARMAX regressor :pr:`4238` * Added support for prediction intervals for VARMAX regressor :pr:`4267` - * Extend STLDecomposer to Support Multiseries :pr:`4253` * Fixes * Added support for pandas 2 :pr:`4216` * Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258` diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 7e99f22d7f..e289f91a3e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -2,6 +2,7 @@ from __future__ import annotations import logging +from typing import Union import matplotlib.pyplot as plt import pandas as pd @@ -77,7 +78,7 @@ def __init__( **kwargs, ) - def _project_trend(self, y): + def _project_trend(self, y, trend, period): """Function to project the in-sample trend into the future.""" self._check_oos_past(y) @@ -89,7 +90,7 @@ def _project_trend(self, y): units_forward = ( len( pd.date_range( - start=self.trend.index[-1], + start=trend.index[-1], end=y.index[-1], freq=self.frequency, ), @@ -101,18 +102,18 @@ def _project_trend(self, y): # Model the trend and project it forward stlf = STLForecast( - self.trend, + trend, ARIMA, model_kwargs=dict(order=(1, 1, 0), trend="t"), - period=self.period, + period=period, ) stlf = stlf.fit() forecast = stlf.forecast(units_forward) # Store forecast summary for use in calculating trend prediction intervals. 
self.forecast_summary = stlf.get_prediction( - len(self.trend), - len(self.trend) + units_forward - 1, + len(trend), + len(trend) + units_forward - 1, ) # Handle out-of-sample forecasts. The forecast will have additional data @@ -129,19 +130,23 @@ def _project_trend(self, y): fore.index = y.index return fore - def _project_trend_and_seasonality(self, y): + def _project_trend_and_seasonality(self, y, trend, seasonality, periodicity): """Function to project both trend and seasonality forward into the future.""" - projected_trend = self._project_trend(y) + projected_trend = self._project_trend(y, trend, periodicity) projected_seasonality = self._project_seasonal( y, - self.seasonality, - self.period, + seasonality, + periodicity, self.frequency, ) return projected_trend, projected_seasonality - def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> STLDecomposer: + def fit( + self, + X: pd.DataFrame, + y: Union[pd.Series, pd.DataFrame] = None, + ) -> STLDecomposer: """Fits the STLDecomposer and determine the seasonal signal. Instantiates a statsmodels STL decompose object with the component's stored @@ -176,57 +181,53 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> STLDecomposer: if isinstance(y, pd.Series): y = y.to_frame() - # If there is a series_id in stacked data or more than one column in unstacked data, set multiseries to true - is_multiseries = False - if self.series_id or len(y.columns) > 1: - is_multiseries = True + self.original_index = y.index if y is not None else None + + X, y = self._check_target(X, y) + + self._map_dt_to_integer(self.original_index, y.index) + # Save the frequency of the fitted series for checking against transform data. 
+ self.frequency = y.index.freqstr or pd.infer_freq(y.index) # Iterate through each id group - self.decompositions = {} + self.seasonals = {} + self.periods = {} + self.seasonalities = {} + self.trends = {} + self.residuals = {} for id in y.columns: series_y = y[id] - self.original_index = series_y.index if series_y is not None else None - - X, series_y = self._check_target(X, series_y) - - self._map_dt_to_integer(self.original_index, series_y.index) - # Save the frequency of the fitted series for checking against transform data. - self.frequency = series_y.index.freqstr or pd.infer_freq(series_y.index) # Determine the period of the seasonal component - if is_multiseries or self.period is None: + if id not in self.periods or self.period is None: self.set_period(X, series_y) + self.periods[id] = self.period - stl = STL(series_y, seasonal=self.seasonal_smoother, period=self.period) + stl = STL( + series_y, + seasonal=self.seasonal_smoother, + period=self.periods[id], + ) res = stl.fit() - self.seasonal = res.seasonal - self.period = stl.period - dist = len(series_y) % self.period - self.seasonality = ( - self.seasonal[-(dist + self.period) : -dist] + self.seasonals[id] = res.seasonal + self.periods[id] = stl.period + dist = len(series_y) % stl.period + self.seasonalities[id] = ( + res.seasonal[-(dist + stl.period) : -dist] if dist > 0 - else self.seasonal[-self.period :] + else res.seasonal[-stl.period :] ) - self.trend = res.trend - self.residual = res.resid - - if is_multiseries: - self.decompositions[id] = { - "seasonal": self.seasonal, - "seasonality": self.seasonality, - "trend": self.trend, - "residual": self.residual, - "period": self.period, - } + self.trends[id] = res.trend + self.residuals[id] = res.resid return self def transform( self, X: pd.DataFrame, - y: pd.DataFrame = None, - ): + y: Union[pd.Series, pd.DataFrame] = None, + ) -> Union[tuple[pd.DataFrame, pd.Series], tuple[pd.DataFrame, pd.DataFrame]]: """Transforms the target data by removing the STL 
trend and seasonality. Uses an ARIMA model to project forward the addititve trend and removes it. Then, utilizes the first period's @@ -255,12 +256,11 @@ def transform( # Iterate through each id group for id in y.columns: series_y = y[id] - if len(y.columns) > 1: - self.seasonality = self.decompositions[id]["seasonality"] - self.trend = self.decompositions[id]["trend"] - self.seasonal = self.decompositions[id]["seasonal"] - self.residual = self.decompositions[id]["residual"] - self.period = self.decompositions[id]["period"] + + seasonality = self.seasonalities[id] + trend = self.trends[id] + residual = self.residuals[id] + period = self.periods[id] original_index = series_y.index X, series_y = self._check_target(X, series_y) @@ -270,15 +270,15 @@ def transform( y_out_of_sample = pd.Series([]) # For partially and wholly in-sample data, retrieve stored results. - if self.trend.index[0] <= series_y.index[0] <= self.trend.index[-1]: - y_in_sample = self.residual[series_y.index[0] : series_y.index[-1]] + if trend.index[0] <= series_y.index[0] <= trend.index[-1]: + y_in_sample = residual[series_y.index[0] : series_y.index[-1]] # For out of sample data.... - if series_y.index[-1] > self.trend.index[-1]: + if series_y.index[-1] > trend.index[-1]: try: # ...that is partially out of sample and partially in sample. truncated_y = series_y[ - series_y.index.get_loc(self.trend.index[-1]) + 1 : + series_y.index.get_loc(trend.index[-1]) + 1 : ] except KeyError: # ...that is entirely out of sample. 
@@ -287,7 +287,12 @@ def transform( ( projected_trend, projected_seasonality, - ) = self._project_trend_and_seasonality(truncated_y) + ) = self._project_trend_and_seasonality( + truncated_y, + trend, + seasonality, + period, + ) y_out_of_sample = infer_feature_types( pd.Series( @@ -309,7 +314,10 @@ def transform( detrending_df = pd.DataFrame(detrending_list).T return X, detrending_df - def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + def inverse_transform( + self, + y_t: Union[pd.Series, pd.DataFrame], + ) -> Union[pd.Series, pd.DataFrame]: """Adds back fitted trend and seasonality to target variable. The STL trend is projected to cover the entire requested target range, then added back into the signal. Then, @@ -319,8 +327,7 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra y_t (pd.Series or pd.DataFrame): Target variable. Returns: - tuple of pd.DataFrame, pd.DataFrame: The first element are the input features returned without modification. - The second element is the target variable y with the trend and seasonality added back in. + pd.Series or pd.DataFrame: The target variable y with the trend and seasonality added back in. Raises: ValueError: If y is None. @@ -342,7 +349,9 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra series_y = y_t[id] index = self._choose_proper_index(series_y) - + old_trend = self.trends[id] + old_seasonal = self.seasonals[id] + period = self.periods[id] # For partially and wholly in-sample data, retrieve stored results. 
if index[0] <= series_y.index[0] <= index[-1]: left_index = series_y.index[0] @@ -353,16 +362,16 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra else series_y.index[-1] + 1 * series_y.index.freq ) trend = ( - self.trend.reset_index(drop=True)[left_index:right_index] + old_trend.reset_index(drop=True)[left_index:right_index] if isinstance(series_y.index, pd.RangeIndex) or series_y.index.is_numeric() - else self.trend[left_index:right_index] + else old_trend[left_index:right_index] ) seasonal = ( - self.seasonal.reset_index(drop=True)[left_index:right_index] + old_seasonal.reset_index(drop=True)[left_index:right_index] if isinstance(series_y.index, pd.RangeIndex) or series_y.index.is_numeric() - else self.seasonal[left_index:right_index] + else old_seasonal[left_index:right_index] ) y_in_sample = series_y + trend + seasonal y_in_sample = y_in_sample.dropna() @@ -378,7 +387,12 @@ def inverse_transform(self, y_t: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFra ( projected_trend, projected_seasonality, - ) = self._project_trend_and_seasonality(truncated_y_t) + ) = self._project_trend_and_seasonality( + truncated_y_t, + old_trend, + old_seasonal, + period, + ) y_out_of_sample = infer_feature_types( pd.Series( @@ -406,9 +420,10 @@ def get_trend_dataframe(self, X, y): a DataFrame for multivariate problems. Returns: - list of pd.DataFrame: Each DataFrame contains the columns "signal", "trend", "seasonality" and "residual," + (Single series) list of pd.DataFrame: Each DataFrame contains the columns "signal", "trend", "seasonality" and "residual," with the latter 3 column values being the decomposed elements of the target data. The "signal" column is simply the input target signal but reindexed with a datetime index to match the input features. 
+ (Multi series) dictionary of lists: Series id maps to a list of pd.DataFrames that each contain the columns "signal", "trend", "seasonality" and "residual" Raises: TypeError: If X does not have time-series data in the index. @@ -426,24 +441,24 @@ def get_trend_dataframe(self, X, y): self._check_oos_past(y) - def _decompose_target(X, y, fh): + def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(self.trend.index) and all( - y.index == self.trend.index, + if len(y.index) == len(trend.index) and all( + y.index == trend.index, ): - trend = self.trend - seasonal = self.seasonal - residual = self.residual + trend = trend + seasonal = seasonal + residual = residual else: # TODO: Do a better job cloning. decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, - period=self.period, + period=period, ) decomposer.fit(X, y) - trend = decomposer.trend - seasonal = decomposer.seasonal - residual = decomposer.residual + trend = decomposer.trends[id] + seasonal = decomposer.seasonals[id] + residual = decomposer.residuals[id] return pd.DataFrame( { "signal": y, @@ -466,16 +481,25 @@ def _decompose_target(X, y, fh): "Provided DatetimeIndex of X should have an inferred frequency.", ) - if len(y.columns) > 1: - self.seasonality = self.decompositions[id]["seasonality"] - self.seasonal = self.decompositions[id]["seasonal"] - self.trend = self.decompositions[id]["trend"] - self.residual = self.decompositions[id]["residual"] - self.period = self.decompositions[id]["period"] + seasonal = self.seasonals[id] + trend = self.trends[id] + residual = self.residuals[id] + period = self.periods[id] series_y = y[id] if isinstance(series_y, pd.Series): - result_dfs.append(_decompose_target(X, series_y, None)) + result_dfs.append( + _decompose_target( + X, + series_y, + None, + trend, + seasonal, + residual, + period, + id, + ), + ) 
series_results[id] = result_dfs @@ -493,25 +517,34 @@ def get_trend_prediction_intervals(self, y, coverage=None): prediction interval should be calculated for. Returns: - dict of pd.Series: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper. + (Single series) dict of pd.Series: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper. + (Multi series) dict of dict of pd.Series: Each series id maps to a dictionary of prediction intervals """ if isinstance(y, pd.Series): y = y.to_frame() + if coverage is None: + coverage = [0.95] + series_results = {} for id in y.columns: y_series = y[id] - if coverage is None: - coverage = [0.95] - self._check_oos_past(y_series) alphas = [1 - val for val in coverage] + trend = self.trends[id] + seasonality = self.seasonalities[id] + period = self.periods[id] if not self.forecast_summary or len(y_series) != len( self.forecast_summary.predicted_mean, ): - self._project_trend_and_seasonality(y_series) + self._project_trend_and_seasonality( + y_series, + trend, + seasonality, + period, + ) prediction_interval_result = {} for i, alpha in enumerate(alphas): diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index 9e5435c0ec..642b3d7333 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -284,7 +284,10 @@ def test_decomposer_prefers_users_time_index( X_t, y_t = dec.fit_transform(X, y) else: X_t, y_t = dec.fit_transform(X, y) - assert all(dec.trend.index.values == expected_values) + if isinstance(dec, STLDecomposer): + assert all(dec.trends[0].index.values == expected_values) + elif isinstance(dec, PolynomialDecomposer): + assert all(dec.trend.index.values == expected_values) @pytest.mark.parametrize( diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py 
b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 5227726594..f9936f718b 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -160,7 +160,7 @@ def test_stl_fit_transform_in_sample( # Check the trend to make sure STL worked properly pd.testing.assert_series_equal( pd.Series(expected_trend), - pd.Series(stl.trend), + pd.Series(stl.trends[0]), check_exact=False, check_index=False, check_names=False, @@ -189,7 +189,7 @@ def test_stl_fit_transform_in_sample( # Check the trend to make sure STL worked properly pd.testing.assert_series_equal( pd.Series(expected_trend), - pd.Series(stl.decompositions[id]["trend"]), + pd.Series(stl.trends[id]), check_exact=False, check_index=False, check_names=False, From 6819ca1e4b0dbce5f00dc38edb532ed69d2882f4 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 16 Aug 2023 12:55:50 -0700 Subject: [PATCH 32/47] change parent function instead of overload --- .../transformers/preprocessing/decomposer.py | 54 +++++++++++---- .../preprocessing/stl_decomposer.py | 65 ------------------- 2 files changed, 40 insertions(+), 79 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index 3f3d0e0718..cf18b9e52b 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -324,7 +324,7 @@ def _project_seasonal( def plot_decomposition( self, X: pd.DataFrame, - y: pd.Series, + y: pd.DataFrame, show: bool = False, ) -> tuple[plt.Figure, list]: """Plots the decomposition of the target signal. 
@@ -340,20 +340,46 @@ def plot_decomposition( plotted on them """ + if isinstance(y, pd.Series): + y = y.to_frame() + + plot_info = {} + if self.frequency and self.time_index and len(y.columns) > 1: + X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) decomposition_results = self.get_trend_dataframe(X, y) - fig, axs = plt.subplots(4) - fig.set_size_inches(18.5, 14.5) - axs[0].plot(decomposition_results[0]["signal"], "r") - axs[0].set_title("signal") - axs[1].plot(decomposition_results[0]["trend"], "b") - axs[1].set_title("trend") - axs[2].plot(decomposition_results[0]["seasonality"], "g") - axs[2].set_title("seasonality") - axs[3].plot(decomposition_results[0]["residual"], "y") - axs[3].set_title("residual") - if show: # pragma: no cover - plt.show() - return fig, axs + + # Iterate through each series id + for id in y.columns: + fig, axs = plt.subplots(4) + fig.set_size_inches(18.5, 14.5) + + for ax in axs: + ax.cla() + + if len(y.columns) > 1: + results = decomposition_results[id] + else: + results = decomposition_results + axs[0].plot(results[0]["signal"], "r") + axs[0].set_title("signal") + axs[1].plot(results[0]["trend"], "b") + axs[1].set_title("trend") + axs[2].plot(results[0]["seasonality"], "g") + axs[2].set_title("seasonality") + axs[3].plot(results[0]["residual"], "y") + axs[3].set_title("residual") + + # If multiseries, return a dictionary of tuples + if len(y.columns) > 1: + fig.suptitle("Decomposition for Series {}".format(id)) + plot_info[id] = (fig, axs) + else: + plot_info = (fig, axs) + + if show: # pragma: no cover + plt.show() + + return plot_info def _check_target(self, X: pd.DataFrame, y: pd.Series): """Function to ensure target is not None and has a pandas.DatetimeIndex.""" diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index e289f91a3e..d01eca5c7a 100644 --- 
a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -4,7 +4,6 @@ import logging from typing import Union -import matplotlib.pyplot as plt import pandas as pd from pandas import RangeIndex from statsmodels.tsa.arima.model import ARIMA @@ -569,67 +568,3 @@ def get_trend_prediction_intervals(self, y, coverage=None): if len(y.columns) <= 1: return prediction_interval_result return series_results - - # Overload the plot_decomposition fucntion to be able to plot multiple decompositions for multiseries - def plot_decomposition( - self, - X: pd.DataFrame, - y: pd.DataFrame, - show: bool = False, - ): - """Plots the decomposition of the target signal. - - Args: - X (pd.DataFrame): Input data with time series data in index. - y (pd.Series or pd.DataFrame): Target variable data provided as a Series for univariate problems or - a DataFrame for multivariate problems. - show (bool): Whether to display the plot or not. Defaults to False. 
- - Returns: - (Single series) matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions - plotted on them - (Multi series) dict[matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]]: A dictionary that maps the series id to - the figure and axes that have the decompositions plotted on them - - - """ - if isinstance(y, pd.Series): - y = y.to_frame() - - plot_info = {} - if self.frequency and self.time_index and len(y.columns) > 1: - X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) - decomposition_results = self.get_trend_dataframe(X, y) - - # Iterate through each series id - for id in y.columns: - fig, axs = plt.subplots(4) - fig.set_size_inches(18.5, 14.5) - - for ax in axs: - ax.cla() - - if len(y.columns) > 1: - results = decomposition_results[id] - else: - results = decomposition_results - axs[0].plot(results[0]["signal"], "r") - axs[0].set_title("signal") - axs[1].plot(results[0]["trend"], "b") - axs[1].set_title("trend") - axs[2].plot(results[0]["seasonality"], "g") - axs[2].set_title("seasonality") - axs[3].plot(results[0]["residual"], "y") - axs[3].set_title("residual") - - # If multiseries, return a dictionary of tuples - if len(y.columns) > 1: - fig.suptitle("Decomposition for Series {}".format(id)) - plot_info[id] = (fig, axs) - else: - plot_info = (fig, axs) - - if show: # pragma: no cover - plt.show() - - return plot_info From 1083245147fca67df48253274e30e3d786564ec0 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 16 Aug 2023 15:56:37 -0700 Subject: [PATCH 33/47] ability to transform on new data --- .../preprocessing/stl_decomposer.py | 55 +++++++++++++------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index d01eca5c7a..b50c5437bd 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py 
+++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -211,12 +211,12 @@ def fit( self.seasonals[id] = res.seasonal self.periods[id] = stl.period dist = len(series_y) % stl.period - self.seasonalities[id] = ( + seasonality = ( res.seasonal[-(dist + stl.period) : -dist] if dist > 0 else res.seasonal[-stl.period :] ) - + self.seasonalities[id] = seasonality self.trends[id] = res.trend self.residuals[id] = res.resid @@ -256,10 +256,16 @@ def transform( for id in y.columns: series_y = y[id] - seasonality = self.seasonalities[id] - trend = self.trends[id] - residual = self.residuals[id] - period = self.periods[id] + if len(y.columns) > 1: + seasonality = self.seasonalities[id] + trend = self.trends[id] + residual = self.residuals[id] + period = self.periods[id] + else: + seasonality = list(self.seasonalities.values())[0] + trend = list(self.trends.values())[0] + residual = list(self.residuals.values())[0] + period = list(self.periods.values())[0] original_index = series_y.index X, series_y = self._check_target(X, series_y) @@ -348,9 +354,15 @@ def inverse_transform( series_y = y_t[id] index = self._choose_proper_index(series_y) - old_trend = self.trends[id] - old_seasonal = self.seasonals[id] - period = self.periods[id] + + if len(y_t.columns) > 1: + old_trend = self.trends[id] + old_seasonal = self.seasonals[id] + period = self.periods[id] + else: + old_trend = list(self.trends.values())[0] + old_seasonal = list(self.seasonals.values())[0] + period = list(self.periods.values())[0] # For partially and wholly in-sample data, retrieve stored results. 
if index[0] <= series_y.index[0] <= index[-1]: left_index = series_y.index[0] @@ -480,10 +492,16 @@ def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): "Provided DatetimeIndex of X should have an inferred frequency.", ) - seasonal = self.seasonals[id] - trend = self.trends[id] - residual = self.residuals[id] - period = self.periods[id] + if len(y.columns) > 1: + seasonal = self.seasonals[id] + trend = self.trends[id] + residual = self.residuals[id] + period = self.periods[id] + else: + seasonal = list(self.seasonals.values())[0] + trend = list(self.trends.values())[0] + residual = list(self.residuals.values())[0] + period = list(self.periods.values())[0] series_y = y[id] if isinstance(series_y, pd.Series): @@ -532,9 +550,14 @@ def get_trend_prediction_intervals(self, y, coverage=None): self._check_oos_past(y_series) alphas = [1 - val for val in coverage] - trend = self.trends[id] - seasonality = self.seasonalities[id] - period = self.periods[id] + if len(y.columns) > 1: + trend = self.trends[id] + seasonality = self.seasonalities[id] + period = self.periods[id] + else: + trend = list(self.trends.values())[0] + seasonality = list(self.seasonalities.values())[0] + period = list(self.periods.values())[0] if not self.forecast_summary or len(y_series) != len( self.forecast_summary.predicted_mean, ): From 998d96472bde83914fa367186d1d669e7baed9be Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 17 Aug 2023 16:23:38 -0700 Subject: [PATCH 34/47] get rid of step param --- .../tests/component_tests/decomposer_tests/test_decomposer.py | 1 - evalml/tests/conftest.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index 642b3d7333..a20e310cfc 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -86,7 +86,6 @@ def 
test_decomposer_plot_decomposition( ) X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( period, - step, ) if y_has_time_index == "y_has_time_index": y = y.set_axis(X.index) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index e114d54f75..29b6fed30e 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2535,7 +2535,6 @@ def generate_multiseries_seasonal_data(): def generate_synthetic_data( period, - step=None, num_periods=20, scale=1, seasonal_scale=1, @@ -2547,7 +2546,6 @@ def generate_synthetic_data( Args: period: The length, in units, of the seasonal signal. - step: num_periods: How many periods of the seasonal signal to generate. scale: The relative scale of the trend. Setting it higher increases the comparative strength of the trend. From 2f32f5b016c5c9f7a28c5043df43fae8ac7798a6 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 18 Aug 2023 09:41:38 -0700 Subject: [PATCH 35/47] fix data types and duplicate lines --- .../components/transformers/preprocessing/decomposer.py | 9 ++++++--- .../transformers/preprocessing/stl_decomposer.py | 1 - 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index cf18b9e52b..06a7d30bfc 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -3,6 +3,7 @@ import re from abc import abstractmethod +from typing import Union import matplotlib.pyplot as plt import numpy as np @@ -324,9 +325,9 @@ def _project_seasonal( def plot_decomposition( self, X: pd.DataFrame, - y: pd.DataFrame, + y: Union[pd.Series, pd.DataFrame], show: bool = False, - ) -> tuple[plt.Figure, list]: + ) -> Union[tuple[plt.Figure, list], dict[str, tuple[plt.Figure]]]: """Plots the decomposition of the target signal. 
Args: @@ -336,8 +337,10 @@ def plot_decomposition( show (bool): Whether to display the plot or not. Defaults to False. Returns: - matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions + (Single series) matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions plotted on them + (Multi series) dict[str, (matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes])]: A dictionary that maps the series id + to the figure and axes that have the decompositions plotted on them """ if isinstance(y, pd.Series): diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index b50c5437bd..eeee6adb1c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -175,7 +175,6 @@ def fit( self.logger.warning( f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) - X, y = self._check_target(X, y) if isinstance(y, pd.Series): y = y.to_frame() From 02733c374729d3f1022806787358b78c841ba752 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 18 Aug 2023 10:00:24 -0700 Subject: [PATCH 36/47] remove stuff from loops --- .../transformers/preprocessing/stl_decomposer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index eeee6adb1c..d0be69088f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -250,6 +250,11 @@ def transform( if isinstance(y, pd.Series): y = y.to_frame() + + original_index = y.index + X, y = self._check_target(X, y) + 
self._check_oos_past(y) + detrending_list = [] # Iterate through each id group for id in y.columns: @@ -266,10 +271,6 @@ def transform( residual = list(self.residuals.values())[0] period = list(self.periods.values())[0] - original_index = series_y.index - X, series_y = self._check_target(X, series_y) - self._check_oos_past(series_y) - y_in_sample = pd.Series([]) y_out_of_sample = pd.Series([]) @@ -542,11 +543,11 @@ def get_trend_prediction_intervals(self, y, coverage=None): if coverage is None: coverage = [0.95] + self._check_oos_past(y) series_results = {} for id in y.columns: y_series = y[id] - self._check_oos_past(y_series) alphas = [1 - val for val in coverage] if len(y.columns) > 1: From 09e31f20437e6ff2ef1cf4b806159e2ca21ca039 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 21 Aug 2023 17:10:19 -0700 Subject: [PATCH 37/47] condense code --- .../transformers/preprocessing/decomposer.py | 31 +- .../preprocessing/stl_decomposer.py | 26 +- .../decomposer_tests/test_decomposer.py | 266 ++++++++---------- .../decomposer_tests/test_stl_decomposer.py | 265 +++++++---------- evalml/tests/conftest.py | 27 +- 5 files changed, 279 insertions(+), 336 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index 06a7d30bfc..53d2260b7b 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -190,6 +190,34 @@ def _detrend_on_fly(X, y): relative_maxima = _get_rel_max_from_acf(y_detrended) return relative_maxima + # def set_period( + # self, + # X: pd.DataFrame, + # y: pd.Series, + # acf_threshold: float = 0.01, + # rel_max_order: int = 5, + # ): + # """Function to set the component's seasonal period based on the target's seasonality. + + # Args: + # X (pandas.DataFrame): The feature data of the time series problem. 
+ # y (pandas.Series): The target data of a time series problem. + # acf_threshold (float) : The threshold for the autocorrelation function to determine the period. Any values below + # the threshold are considered to be 0 and will not be considered for the period. Defaults to 0.01. + # rel_max_order (int) : The order of the relative maximum to determine the period. Defaults to 5. + + # """ + # self.periods = {} + # if len(y.columns) == 1: + # self.period = self.determine_periodicity(X, y, acf_threshold, rel_max_order) + # self.update_parameters({"period": self.period}) + # self.periods[id] = self.period + # return + # else: + # for id in y.columns: + # self.periods[id] = self.determine_periodicity(X, y[id], acf_threshold, rel_max_order) + # self.update_parameters({"periods": self.periods}) + def set_period( self, X: pd.DataFrame, @@ -356,9 +384,6 @@ def plot_decomposition( fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) - for ax in axs: - ax.cla() - if len(y.columns) > 1: results = decomposition_results[id] else: diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index d0be69088f..33061d38b2 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -45,12 +45,14 @@ def __init__( series_id: str = None, degree: int = 1, # Currently unused. period: int = None, + periods: dict = None, seasonal_smoother: int = 7, random_seed: int = 0, **kwargs, ): self.logger = logging.getLogger(__name__) self.series_id = series_id + self.periods = periods # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. 
if seasonal_smoother % 2 == 0: @@ -64,6 +66,7 @@ def __init__( parameters = { "degree": degree, "period": period, + "periods": periods, "seasonal_smoother": seasonal_smoother, "time_index": time_index, "series_id": series_id, @@ -189,10 +192,22 @@ def fit( self.frequency = y.index.freqstr or pd.infer_freq(y.index) # Iterate through each id group self.seasonals = {} - self.periods = {} self.seasonalities = {} self.trends = {} self.residuals = {} + self.periods = {} + + # # Determine the period of the seasonal component + # # Set the period if it is single series and period is given + # if self.period is not None and len(y.columns) == 1: + # self.periods = {0: self.period} + # # Set periods if it is single series and period is + # if self.periods is None or self.period is None: + # self.set_period(X, y) + + # if self.period is None: + # self.set_period(X, y) + for id in y.columns: series_y = y[id] @@ -347,14 +362,13 @@ def inverse_transform( if isinstance(y_t, pd.Series): y_t = y_t.to_frame() + index = self._choose_proper_index(y_t) y = [] for id in y_t.columns: y_in_sample = pd.Series([]) y_out_of_sample = pd.Series([]) series_y = y_t[id] - index = self._choose_proper_index(series_y) - if len(y_t.columns) > 1: old_trend = self.trends[id] old_seasonal = self.seasonals[id] @@ -454,13 +468,9 @@ def get_trend_dataframe(self, X, y): def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(trend.index) and all( + if len(y.index) != len(trend.index) or not all( y.index == trend.index, ): - trend = trend - seasonal = seasonal - residual = residual - else: # TODO: Do a better job cloning. 
decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index a20e310cfc..f4e26f4361 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -72,21 +72,23 @@ def test_decomposer_plot_decomposition( decomposer_child_class, y_has_time_index, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + step = 0.01 period = 9 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")(period, step) - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )(period, step) + if y_has_time_index == "y_has_time_index": y = y.set_axis(X.index) @@ -138,7 +140,7 @@ def test_decomposer_plot_decomposition( def test_decomposer_uses_time_index( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, X_has_time_index, X_num_time_columns, @@ -152,10 +154,7 @@ def test_decomposer_uses_time_index( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - X.index = X["date"] - y = y.set_axis(X.index) - X.ww.init() + X, _, y = ts_multiseries_data() time_index_col_name = "date" assert 
isinstance(X.index, pd.DatetimeIndex) @@ -453,7 +452,7 @@ def test_decomposer_projected_seasonality_integer_and_datetime( def test_decomposer_get_trend_dataframe_raises_errors( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -463,12 +462,7 @@ def test_decomposer_get_trend_dataframe_raises_errors( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - dts = pd.date_range("01-01-2000", periods=len(X)) - datetime_index = pd.DatetimeIndex(dts) - X.index = datetime_index - y.index = datetime_index - X["date"] = dts + X, _, y = ts_multiseries_data() dec = decomposer_child_class() dec.fit_transform(X, y) @@ -629,6 +623,7 @@ def test_decomposer_determine_periodicity_nullable_type_incompatibility( def test_decomposer_get_trend_dataframe_error_not_fit( decomposer_child_class, ts_data, + ts_multiseries_data, multiseries_ts_data_unstacked, variateness, fit_before_decompose, @@ -640,10 +635,8 @@ def test_decomposer_get_trend_dataframe_error_not_fit( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - X.index = X["date"] - X.index.freq = "D" - + X, _, y = ts_multiseries_data() + # X, y = multiseries_ts_data_unstacked dec = decomposer_child_class(time_index="date") if fit_before_decompose: dec.fit_transform(X, y) @@ -669,7 +662,7 @@ def test_decomposer_get_trend_dataframe_error_not_fit( def test_decomposer_transform_returns_same_when_y_none( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -679,7 +672,7 @@ def test_decomposer_transform_returns_same_when_y_none( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = 
ts_multiseries_data() dec = decomposer_child_class().fit(X, y) X_t, y_t = dec.transform(X, None) @@ -701,7 +694,7 @@ def test_decomposer_transform_returns_same_when_y_none( def test_decomposer_raises_value_error_target_is_none( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -711,7 +704,7 @@ def test_decomposer_raises_value_error_target_is_none( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() with pytest.raises(ValueError, match="cannot be None for Decomposer!"): decomposer_child_class(degree=3).fit_transform(X, None) @@ -739,7 +732,7 @@ def test_decomposer_raises_value_error_target_is_none( def test_decomposer_bad_target_index( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -749,7 +742,7 @@ def test_decomposer_bad_target_index( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() dec = decomposer_child_class() y.index = pd.CategoricalIndex(["cat_index" for x in range(len(y))]) @@ -786,48 +779,52 @@ def test_decomposer_bad_target_index( def test_decomposer_fit_transform_out_of_sample( decomposer_child_class, variateness, - generate_multiseries_seasonal_data, generate_seasonal_data, transformer_fit_on_data, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + # Generate 10 periods (the default) of synthetic seasonal data period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - 
set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - subset_y = y[2 * period : 7 * period] - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - subset_y = y.loc[y.index[2 * period : 7 * period]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + subset_y = y.loc[y.index[2 * period : 7 * period]] subset_X = X[2 * period : 7 * period] decomposer = decomposer_child_class(period=period) decomposer.fit(subset_X, subset_y) if transformer_fit_on_data == "in-sample": - if variateness == "univariate": - output_X, output_y = decomposer.transform(subset_X, subset_y) - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index), - output_y, - check_dtype=False, - check_names=False, - atol=0.2, - ) + output_X, output_y = decomposer.transform(subset_X, subset_y) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = y_expected = pd.DataFrame( + [np.zeros(len(output_y)), np.zeros(len(output_y))], + ).T.set_axis(subset_y.index) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index) + assert_function( + y_expected, + output_y, + check_dtype=False, + check_names=False, + atol=0.2, + ) if transformer_fit_on_data != "in-sample": y_new = build_test_target( @@ -846,25 +843,23 
@@ def test_decomposer_fit_transform_out_of_sample( ): output_X, output_inverse_y = decomposer.transform(None, y_new) else: - if variateness == "univariate": - output_X, output_y_t = decomposer.transform(None, y[y_new.index]) - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), - output_y_t, - check_exact=False, - atol=0.1, # STLDecomposer is within atol=5.0e-4 - ) - elif variateness == "multivariate": + output_X, output_y_t = decomposer.transform(None, y.loc[y_new.index]) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal y_new = pd.DataFrame([y_new, y_new]).T - output_X, output_y_t = decomposer.transform(None, y.loc[y_new.index]) - pd.testing.assert_frame_equal( - pd.DataFrame( - [np.zeros(len(output_y_t)), np.zeros(len(output_y_t))], - ).T.set_axis(y_new.index), - output_y_t, - check_exact=False, - atol=0.1, # STLDecomposer is within atol=5.0e-4 - ) + y_expected = pd.DataFrame( + [np.zeros(len(output_y_t)), np.zeros(len(output_y_t))], + ).T.set_axis(y_new.index) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index) + + assert_function( + y_expected, + output_y_t, + check_exact=False, + atol=0.1, # STLDecomposer is within atol=5.0e-4 + ) @pytest.mark.parametrize( @@ -895,60 +890,50 @@ def test_decomposer_inverse_transform( decomposer_child_class, index_type, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, transformer_fit_on_data, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + # Generate 10 periods (the default) of synthetic seasonal data period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - 
seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_y = y[: 5 * period] - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_y = y.loc[y.index[: 5 * period]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) subset_X = X[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] decomposer = decomposer_child_class(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - if isinstance(decomposer, STLDecomposer) and variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(subset_y), - output_inverse_y, - check_dtype=False, - ) - elif ( - isinstance(decomposer, PolynomialDecomposer) - or isinstance(decomposer, STLDecomposer) - and variateness == "univariate" - ): - pd.testing.assert_series_equal( - pd.Series(subset_y), - output_inverse_y, - check_dtype=False, - ) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(subset_y) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(subset_y) + 
assert_function( + y_expected, + output_inverse_y, + check_dtype=False, + ) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -972,25 +957,20 @@ def test_decomposer_inverse_transform( output_inverse_y = decomposer.inverse_transform(y_t_new) # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. - if isinstance(decomposer, STLDecomposer) and variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(y.loc[y_t_new.index]), - output_inverse_y, - check_exact=False, - rtol=1.0e-1, - ) - elif ( - isinstance(decomposer, PolynomialDecomposer) - or isinstance(decomposer, STLDecomposer) - and variateness == "univariate" - ): - pd.testing.assert_series_equal( - pd.Series(y[y_t_new.index]), - output_inverse_y, - check_exact=False, - check_index=False, - rtol=1.0e-1, - ) + + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(y.loc[y_t_new.index]) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(y[y_t_new.index]) + assert_function( + y_expected, + output_inverse_y, + check_exact=False, + rtol=1.0e-1, + ) + pd.testing.assert_index_equal( y.loc[y_t_new.index].index, output_inverse_y.index, @@ -1040,7 +1020,7 @@ def test_decomposer_doesnt_modify_target_index( def test_decomposer_monthly_begin_data( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -1050,7 +1030,7 @@ def test_decomposer_monthly_begin_data( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() dts = pd.date_range("01-01-2000", periods=len(X), freq="MS") datetime_index = pd.DatetimeIndex(dts) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py 
b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index f9936f718b..52b9cf532c 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -17,6 +17,7 @@ def test_stl_decomposer_init(): assert decomp.parameters == { "degree": 3, "period": None, + "periods": None, "seasonal_smoother": 7, "time_index": "dates", "series_id": None, @@ -28,6 +29,7 @@ def test_stl_decomposer_multiseries_init(): assert decomp.parameters == { "degree": 3, "period": None, + "periods": None, "seasonal_smoother": 7, "time_index": "dates", "series_id": "ids", @@ -52,13 +54,13 @@ def test_stl_decomposer_auto_sets_seasonal_smoother_to_odd(): def test_stl_raises_warning_high_smoother( caplog, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": X, _, y = ts_data() elif variateness == "multivariate": - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() stl = STLDecomposer(seasonal_smoother=101) stl.fit(X, y) assert "STLDecomposer may perform poorly" in caplog.text @@ -119,39 +121,42 @@ def test_stl_fit_transform_in_sample( freq, trend_degree, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period, - freq_str=freq, - trend_degree=trend_degree, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period, - freq_str=freq, - trend_degree=trend_degree, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period, + freq_str=freq, + trend_degree=trend_degree, + ) stl = STLDecomposer(period=period) X_t, y_t = stl.fit_transform(X, y) - if variateness == "univariate": + # If y_t is a pd.Series, give it columns + if isinstance(y_t, pd.Series): + y_t = 
y_t.to_frame() + if isinstance(y, pd.Series): + y = y.to_frame() + # Get the expected answer + for id in y_t.columns: + y_t_series = y_t[id] + y_series = y[id] # Get the expected answer lin_reg = LinearRegression(fit_intercept=True) features = PolynomialFeatures(degree=trend_degree).fit_transform( np.arange(X.shape[0]).reshape(-1, 1), ) - lin_reg.fit(features, y) + lin_reg.fit(features, y_series) expected_trend = lin_reg.predict(features) # Check to make sure STL detrended/deseasoned pd.testing.assert_series_equal( - pd.Series(np.zeros(len(y_t))), - y_t, + pd.Series(np.zeros(len(y_t_series))), + y_t_series, check_exact=False, check_index=False, check_names=False, @@ -166,36 +171,6 @@ def test_stl_fit_transform_in_sample( check_names=False, atol=0.3, ) - elif variateness == "multivariate": - # Get the expected answer - for id in y.columns: - # Check to make sure STL detrended/deseasoned - y_t_series = y_t[id] - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(y_t_series))), - y_t_series, - check_exact=False, - check_index=False, - check_names=False, - atol=0.1, - ) - y_series = y[id] - lin_reg = LinearRegression(fit_intercept=True) - features = PolynomialFeatures(degree=trend_degree).fit_transform( - np.arange(X.shape[0]).reshape(-1, 1), - ) - lin_reg.fit(features, y_series) - expected_trend = lin_reg.predict(features) - # Check the trend to make sure STL worked properly - pd.testing.assert_series_equal( - pd.Series(expected_trend), - pd.Series(stl.trends[id]), - check_exact=False, - check_index=False, - check_names=False, - atol=0.3, - ) - # Verify the X is not changed pd.testing.assert_frame_equal(X, X_t) @@ -223,50 +198,37 @@ def test_stl_fit_transform_in_sample( def test_stl_decomposer_inverse_transform( index_type, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, transformer_fit_on_data, ): # Generate 10 periods (the default) of synthetic seasonal data period = 7 - if variateness == "univariate": - X, y = 
generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_X = X[: 5 * period] - subset_y = y[: 5 * period] - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_y = y.loc[y.index[: 5 * period]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) subset_X = X[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] + decomposer = STLDecomposer(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - if variateness == "univariate": - pd.testing.assert_series_equal( - subset_y, - output_inverse_y, - check_dtype=False, - ) - elif variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(subset_y), - output_inverse_y, - check_dtype=False, - ) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(subset_y) + else: + assert_function = pd.testing.assert_series_equal + y_expected = subset_y + assert_function(y_expected, output_inverse_y, check_dtype=False) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -291,20 +253,19 @@ def test_stl_decomposer_inverse_transform( # we need to test the indices equivalence separately. 
output_inverse_y = decomposer.inverse_transform(y_t_new) - if variateness == "univariate": - pd.testing.assert_series_equal( - y[y_t_new.index], - output_inverse_y, - check_index=False, - rtol=1.0e-2, - ) - elif variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(y.loc[y_t_new.index]), - output_inverse_y, - check_exact=False, - rtol=1.0e-1, - ) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(y.loc[y_t_new.index]) + else: + assert_function = pd.testing.assert_series_equal + y_expected = y[y_t_new.index] + assert_function( + y_expected, + output_inverse_y, + check_exact=False, + rtol=1.0e-1, + ) + pd.testing.assert_index_equal( y.loc[y_t_new.index].index, output_inverse_y.index, @@ -334,29 +295,21 @@ def test_stl_decomposer_inverse_transform( @pytest.mark.parametrize("fit_before_decompose", [True, False]) def test_stl_decomposer_get_trend_dataframe( generate_seasonal_data, - generate_multiseries_seasonal_data, transformer_fit_on_data, fit_before_decompose, variateness, ): period = 7 - - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - subset_y = y[: 5 * period] - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - subset_y = y.loc[y.index[: 5 * period]] - + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + ) subset_X = X[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] if transformer_fit_on_data == "in-sample": dec = STLDecomposer() @@ -446,19 +399,15 @@ def test_stl_decomposer_get_trend_dataframe( ) def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( generate_seasonal_data, - 
generate_multiseries_seasonal_data, variateness, ): - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) assert not isinstance(y.index, pd.DatetimeIndex) @@ -493,22 +442,19 @@ def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( def test_unsupported_frequencies( bad_frequency, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): """This test exists to highlight that even though the underlying statsmodels STL component won't work for minute or annual frequencies, we can still run these frequencies with automatic period detection. """ - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - freq_str=bad_frequency, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=7, - freq_str=bad_frequency, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + freq_str=bad_frequency, + ) + stl = STLDecomposer() X_t, y_t = stl.fit_transform(X, y) assert stl.period is not None @@ -523,19 +469,15 @@ def test_unsupported_frequencies( ) def test_stl_decomposer_doesnt_modify_target_index( generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) + X, 
y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) original_X_index = X.index original_y_index = y.index @@ -566,28 +508,21 @@ def test_stl_decomposer_get_trend_prediction_intervals( set_coverage, index_type, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): coverage = [0.75, 0.85, 0.95] if set_coverage else None period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - y_train = y[: 15 * period] - y_validate = y[15 * period :] - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - y_train = y.loc[y.index[: 15 * period]] - y_validate = y.loc[y.index[15 * period :]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + ) X_train = X[: 15 * period] + y_train = y.loc[y.index[: 15 * period]] + y_validate = y.loc[y.index[15 * period :]] stl = STLDecomposer() stl.fit(X_train, y_train) @@ -626,15 +561,13 @@ def assert_pred_interval_coverage(pred_interval): ) def test_stl_decomposer_plot_decomposition( ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": X, _, y = ts_data() elif variateness == "multivariate": - X, y = multiseries_ts_data_unstacked - X.index = X["date"] - X.index.freq = "D" + X, _, y = ts_multiseries_data() dec = STLDecomposer(time_index="date") dec.fit_transform(X, y) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 29b6fed30e..6306844a94 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2520,21 +2520,9 @@ def generate_synthetic_data( y = y.set_axis(dts) return X, y - def 
_return_proper_func(real_or_synthetic): - if real_or_synthetic == "synthetic": - return generate_synthetic_data - elif real_or_synthetic == "real": - return generate_real_data - - return _return_proper_func - - -@pytest.fixture -def generate_multiseries_seasonal_data(): - """Function that returns data with a linear trend and a seasonal signal with specified period for multiseries.""" - - def generate_synthetic_data( + def generate_multiseries_synthetic_data( period, + step=None, num_periods=20, scale=1, seasonal_scale=1, @@ -2589,9 +2577,16 @@ def generate_synthetic_data( y_ms = pd.DataFrame(y_ms_list).T return X, y_ms - def _return_proper_func(real_or_synthetic): - if real_or_synthetic == "synthetic": + def _return_proper_func(real_or_synthetic, univariate_or_multivariate="univariate"): + if ( + real_or_synthetic == "synthetic" + and univariate_or_multivariate == "univariate" + ): return generate_synthetic_data + elif real_or_synthetic == "real" and univariate_or_multivariate == "univariate": + return generate_real_data + if univariate_or_multivariate == "multivariate": + return generate_multiseries_synthetic_data return _return_proper_func From 837fc79775e68fac9bf79ad5ffdc160e3ec9cd06 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 22 Aug 2023 10:10:07 -0700 Subject: [PATCH 38/47] periods parameters --- .../transformers/preprocessing/decomposer.py | 28 ------------ .../preprocessing/stl_decomposer.py | 41 ++++++++--------- .../decomposer_tests/test_decomposer.py | 44 ------------------- .../test_polynomial_decomposer.py | 23 ++++++++++ .../decomposer_tests/test_stl_decomposer.py | 34 +++++++++++++- 5 files changed, 74 insertions(+), 96 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index 53d2260b7b..d93ae6353a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ 
b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -190,34 +190,6 @@ def _detrend_on_fly(X, y): relative_maxima = _get_rel_max_from_acf(y_detrended) return relative_maxima - # def set_period( - # self, - # X: pd.DataFrame, - # y: pd.Series, - # acf_threshold: float = 0.01, - # rel_max_order: int = 5, - # ): - # """Function to set the component's seasonal period based on the target's seasonality. - - # Args: - # X (pandas.DataFrame): The feature data of the time series problem. - # y (pandas.Series): The target data of a time series problem. - # acf_threshold (float) : The threshold for the autocorrelation function to determine the period. Any values below - # the threshold are considered to be 0 and will not be considered for the period. Defaults to 0.01. - # rel_max_order (int) : The order of the relative maximum to determine the period. Defaults to 5. - - # """ - # self.periods = {} - # if len(y.columns) == 1: - # self.period = self.determine_periodicity(X, y, acf_threshold, rel_max_order) - # self.update_parameters({"period": self.period}) - # self.periods[id] = self.period - # return - # else: - # for id in y.columns: - # self.periods[id] = self.determine_periodicity(X, y[id], acf_threshold, rel_max_order) - # self.update_parameters({"periods": self.periods}) - def set_period( self, X: pd.DataFrame, diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 33061d38b2..f1139a6036 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -195,26 +195,27 @@ def fit( self.seasonalities = {} self.trends = {} self.residuals = {} - self.periods = {} - - # # Determine the period of the seasonal component - # # Set the period if it is single series and period is given - # if self.period is not None and len(y.columns) == 1: - # 
self.periods = {0: self.period} - # # Set periods if it is single series and period is - # if self.periods is None or self.period is None: - # self.set_period(X, y) - - # if self.period is None: - # self.set_period(X, y) + if self.periods is None: + self.periods = {} for id in y.columns: series_y = y[id] # Determine the period of the seasonal component - if id not in self.periods or self.period is None: - self.set_period(X, series_y) - self.periods[id] = self.period + if id not in self.periods: + period = self.determine_periodicity( + X, + series_y, + acf_threshold=0.01, + rel_max_order=5, + ) + if self.period is None and len(y.columns) == 1: + self.period = period + self.update_parameters({"period": self.period}) + elif self.period is not None and len(y.columns) == 1: + period = self.period + self.periods[id] = period + self.update_parameters({"periods": self.periods}) stl = STL( series_y, @@ -463,7 +464,8 @@ def get_trend_dataframe(self, X, y): # in ForecastingHorizon during decomposition. 
if not isinstance(y.index, pd.DatetimeIndex): y = self._set_time_index(X, y) - + if not isinstance(X.index, pd.DatetimeIndex): + X.index = y.index self._check_oos_past(y) def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): @@ -495,13 +497,6 @@ def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): # Iterate through each series id for id in y.columns: result_dfs = [] - if not isinstance(X.index, pd.DatetimeIndex): - raise TypeError("Provided X should have datetimes in the index.") - if X.index.freq is None: - raise ValueError( - "Provided DatetimeIndex of X should have an inferred frequency.", - ) - if len(y.columns) > 1: seasonal = self.seasonals[id] trend = self.trends[id] diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index f4e26f4361..c50c8193ab 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -438,50 +438,6 @@ def test_decomposer_projected_seasonality_integer_and_datetime( ) -@pytest.mark.parametrize( - "decomposer_child_class", - decomposer_list, -) -@pytest.mark.parametrize( - "variateness", - [ - "univariate", - "multivariate", - ], -) -def test_decomposer_get_trend_dataframe_raises_errors( - decomposer_child_class, - ts_data, - ts_multiseries_data, - variateness, -): - if variateness == "univariate": - X, _, y = ts_data() - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, _, y = ts_multiseries_data() - - dec = decomposer_child_class() - dec.fit_transform(X, y) - - with pytest.raises( - TypeError, - match="Provided X should have datetimes in the index.", - ): - X_int_index = X.reset_index() - dec.get_trend_dataframe(X_int_index, y) - - with pytest.raises( - 
ValueError, - match="Provided DatetimeIndex of X should have an inferred frequency.", - ): - X.index.freq = None - dec.get_trend_dataframe(X, y) - - @pytest.mark.parametrize( "decomposer_child_class", decomposer_list, diff --git a/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py index 2f2f9a049d..a8fc871ed7 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py @@ -98,3 +98,26 @@ def test_polynomial_decomposer_needs_monotonic_index(ts_data): decomposer.fit_transform(X, y_shuffled) expected_errors = ["monotonically", "X must be in an sktime compatible format"] assert any([error in str(exec_info.value) for error in expected_errors]) + + +def test_polynomial_decomposer_get_trend_dataframe_raises_errors( + ts_data, +): + X, _, y = ts_data() + + dec = PolynomialDecomposer() + dec.fit_transform(X, y) + + with pytest.raises( + TypeError, + match="Provided X should have datetimes in the index.", + ): + X_int_index = X.reset_index() + dec.get_trend_dataframe(X_int_index, y) + + with pytest.raises( + ValueError, + match="Provided DatetimeIndex of X should have an inferred frequency.", + ): + X.index.freq = None + dec.get_trend_dataframe(X, y) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 52b9cf532c..eb63663d11 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -447,6 +447,7 @@ def test_unsupported_frequencies( """This test exists to highlight that even though the underlying statsmodels STL component won't work for minute or annual frequencies, we can still run these frequencies with automatic period detection. 
""" + # period = 7 if variateness == "univariate" else {} X, y = generate_seasonal_data( real_or_synthetic="synthetic", univariate_or_multivariate=variateness, @@ -457,7 +458,38 @@ def test_unsupported_frequencies( stl = STLDecomposer() X_t, y_t = stl.fit_transform(X, y) - assert stl.period is not None + if variateness == "univariate": + assert stl.period is not None + else: + assert stl.periods is not None + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_init_periods( + generate_seasonal_data, + variateness, +): + """This test exists to highlight that even though the underlying statsmodels STL component won't work + for minute or annual frequencies, we can still run these frequencies with automatic period detection. + """ + period = 7 + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )(period) + periods = {id: 8 for id in y.columns} if variateness == "multivariate" else None + stl = STLDecomposer(period=period, periods=periods) + X_t, y_t = stl.fit_transform(X, y) + if variateness == "univariate": + assert stl.period == period + else: + assert stl.periods == periods @pytest.mark.parametrize( From 3677345a282d0e54f315ba0475536c55ddd9015c Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 22 Aug 2023 12:33:55 -0700 Subject: [PATCH 39/47] add unstacking --- .../transformers/preprocessing/stl_decomposer.py | 5 +++++ .../decomposer_tests/test_stl_decomposer.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index f1139a6036..6751aa36ff 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -12,6 +12,7 @@ from evalml.pipelines.components.transformers.preprocessing.decomposer import 
Decomposer from evalml.utils import infer_feature_types +from pipelines.utils import unstack_multiseries class STLDecomposer(Decomposer): @@ -179,6 +180,10 @@ def fit( f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) + # If y is a stacked pd.Series, unstack it + if self.series_id is not None and isinstance(y, pd.Series): + X, y = unstack_multiseries(X, y, self.series_id) + if isinstance(y, pd.Series): y = y.to_frame() diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index eb63663d11..8397a66476 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -616,3 +616,13 @@ def test_stl_decomposer_plot_decomposition( assert isinstance(fig, matplotlib.pyplot.Figure) assert isinstance(axs, np.ndarray) assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + + +def test_stl_decomposer_unstack_series_id( + multiseries_ts_data_stacked, +): + X, y = multiseries_ts_data_stacked + + dec = STLDecomposer(time_index="date", series_id="series_id") + X_t, y_t = dec.fit_transform(X, y) + assert len(y_t) == len(y["series_id"]) From fd99c53cacf4dd5013b2e213a460f8e6dcb7169e Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 22 Aug 2023 12:49:28 -0700 Subject: [PATCH 40/47] fix import --- .../transformers/preprocessing/stl_decomposer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 6751aa36ff..c7e1a47b92 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -11,8 +11,8 @@ from statsmodels.tsa.seasonal import STL from 
evalml.pipelines.components.transformers.preprocessing.decomposer import Decomposer +from evalml.pipelines.utils import unstack_multiseries from evalml.utils import infer_feature_types -from pipelines.utils import unstack_multiseries class STLDecomposer(Decomposer): @@ -472,8 +472,9 @@ def get_trend_dataframe(self, X, y): if not isinstance(X.index, pd.DatetimeIndex): X.index = y.index self._check_oos_past(y) + trend = 0 - def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): + def _decompose_target(X, y, fh, trend, seasonal, residual): """Function to generate a single DataFrame with trend, seasonality and residual components.""" if len(y.index) != len(trend.index) or not all( y.index == trend.index, @@ -523,8 +524,6 @@ def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): trend, seasonal, residual, - period, - id, ), ) From dd7b74fb67707047246a5abf05435c31d1a63436 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 22 Aug 2023 13:55:16 -0700 Subject: [PATCH 41/47] add unstacking and test --- .../transformers/preprocessing/stl_decomposer.py | 12 +++++++++--- .../decomposer_tests/test_stl_decomposer.py | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index c7e1a47b92..1c4d36de41 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -11,7 +11,6 @@ from statsmodels.tsa.seasonal import STL from evalml.pipelines.components.transformers.preprocessing.decomposer import Decomposer -from evalml.pipelines.utils import unstack_multiseries from evalml.utils import infer_feature_types @@ -174,6 +173,8 @@ def fit( ValueError: If y is None. 
ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ + from evalml.pipelines.utils import unstack_multiseries + # Warn for poor decomposition use with higher seasonal smoothers if self.seasonal_smoother > 14: self.logger.warning( @@ -182,7 +183,7 @@ def fit( # If y is a stacked pd.Series, unstack it if self.series_id is not None and isinstance(y, pd.Series): - X, y = unstack_multiseries(X, y, self.series_id) + X, y = unstack_multiseries(X, y, self.series_id, self.time_index, y.name) if isinstance(y, pd.Series): y = y.to_frame() @@ -266,9 +267,15 @@ def transform( Raises: ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ + from evalml.pipelines.utils import unstack_multiseries + if y is None: return X, y + # If y is a stacked pd.Series, unstack it + if self.series_id is not None and isinstance(y, pd.Series): + X, y = unstack_multiseries(X, y, self.series_id, self.time_index, y.name) + if isinstance(y, pd.Series): y = y.to_frame() @@ -472,7 +479,6 @@ def get_trend_dataframe(self, X, y): if not isinstance(X.index, pd.DatetimeIndex): X.index = y.index self._check_oos_past(y) - trend = 0 def _decompose_target(X, y, fh, trend, seasonal, residual): """Function to generate a single DataFrame with trend, seasonality and residual components.""" diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 8397a66476..1ceff6bb52 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -623,6 +623,6 @@ def test_stl_decomposer_unstack_series_id( ): X, y = multiseries_ts_data_stacked - dec = STLDecomposer(time_index="date", series_id="series_id") - X_t, y_t = dec.fit_transform(X, y) - assert len(y_t) == len(y["series_id"]) + dec = STLDecomposer(series_id="series_id", time_index="date") + 
X_output, y_output = dec.fit_transform(X, y) + assert len(y_output.columns) == X["series_id"].nunique() From d8e23e0531ac649fb6339b83cf800a5ada3bbdbd Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 22 Aug 2023 16:55:55 -0700 Subject: [PATCH 42/47] remove comments --- .../component_tests/decomposer_tests/test_stl_decomposer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 1ceff6bb52..0598b771c7 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -475,9 +475,6 @@ def test_init_periods( generate_seasonal_data, variateness, ): - """This test exists to highlight that even though the underlying statsmodels STL component won't work - for minute or annual frequencies, we can still run these frequencies with automatic period detection. 
- """ period = 7 X, y = generate_seasonal_data( real_or_synthetic="synthetic", From ecbfe86c9f34dce1edf0a9be8c4ad0b1c16ece1c Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 25 Aug 2023 13:52:40 -0700 Subject: [PATCH 43/47] update periods and tests --- .../preprocessing/stl_decomposer.py | 23 +++-- .../decomposer_tests/test_decomposer.py | 2 - .../decomposer_tests/test_stl_decomposer.py | 96 +++++++++++++++++-- 3 files changed, 99 insertions(+), 22 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 1c4d36de41..80aec5c06d 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -209,19 +209,13 @@ def fit( # Determine the period of the seasonal component if id not in self.periods: - period = self.determine_periodicity( - X, - series_y, - acf_threshold=0.01, - rel_max_order=5, + # If the user provides a period for single series, use that + period = ( + self.period + if len(y.columns) == 1 and self.period is not None + else self.determine_periodicity(X, series_y) ) - if self.period is None and len(y.columns) == 1: - self.period = period - self.update_parameters({"period": self.period}) - elif self.period is not None and len(y.columns) == 1: - period = self.period self.periods[id] = period - self.update_parameters({"periods": self.periods}) stl = STL( series_y, @@ -240,6 +234,7 @@ def fit( self.seasonalities[id] = seasonality self.trends[id] = res.trend self.residuals[id] = res.resid + self.update_parameters({"periods": self.periods}) return self @@ -471,7 +466,11 @@ def get_trend_dataframe(self, X, y): """ X = infer_feature_types(X) - + if not isinstance(X.index, pd.DatetimeIndex) and not isinstance( + y.index, + pd.DatetimeIndex, + ): + raise TypeError("Provided X or y should have datetimes in the index.") # Change the y 
index to a matching datetimeindex or else we get a failure # in ForecastingHorizon during decomposition. if not isinstance(y.index, pd.DatetimeIndex): diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index c50c8193ab..9979dd9eb4 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -580,7 +580,6 @@ def test_decomposer_get_trend_dataframe_error_not_fit( decomposer_child_class, ts_data, ts_multiseries_data, - multiseries_ts_data_unstacked, variateness, fit_before_decompose, ): @@ -592,7 +591,6 @@ def test_decomposer_get_trend_dataframe_error_not_fit( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) X, _, y = ts_multiseries_data() - # X, y = multiseries_ts_data_unstacked dec = decomposer_child_class(time_index="date") if fit_before_decompose: dec.fit_transform(X, y) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 0598b771c7..1d2cd2aa6b 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -76,20 +76,34 @@ def test_stl_raises_warning_high_smoother( (40, "M"), ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_sets_determined_period( period, freq, generate_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period, freq_str=freq, ) stl = STLDecomposer() stl.fit(X, y) + if isinstance(y, pd.Series): + y = y.to_frame() # Allow for a slight margin of error with detection - assert period * 0.99 <= stl.period <= period * 
1.01 + for id in y.columns: + assert period * 0.99 <= stl.periods[id] <= period * 1.01 @pytest.mark.parametrize( @@ -397,7 +411,77 @@ def test_stl_decomposer_get_trend_dataframe( "multivariate", ], ) -def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( +def test_stl_decomposer_get_trend_dataframe_raises_errors( + variateness, + generate_seasonal_data, +): + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) + + stl = STLDecomposer() + stl.fit_transform(X, y) + + with pytest.raises( + TypeError, + match="Provided X or y should have datetimes in the index.", + ): + X_int_index = X.reset_index() + y_int_index = y.reset_index() + stl.get_trend_dataframe(X_int_index, y_int_index) + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_decomposer_get_trend_dataframe_sets_X_index_internally( + variateness, + generate_seasonal_data, +): + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) + + X = X.reset_index() + assert not isinstance(X.index, pd.DatetimeIndex) + + stl = STLDecomposer() + stl.fit(X, y) + result_dfs = stl.get_trend_dataframe(X, y) + + if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) + for df in result_dfs + ) + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_decomposer_get_trend_dataframe_sets_y_index_internally( generate_seasonal_data, variateness, ): @@ -447,7 +531,6 
@@ def test_unsupported_frequencies( """This test exists to highlight that even though the underlying statsmodels STL component won't work for minute or annual frequencies, we can still run these frequencies with automatic period detection. """ - # period = 7 if variateness == "univariate" else {} X, y = generate_seasonal_data( real_or_synthetic="synthetic", univariate_or_multivariate=variateness, @@ -458,10 +541,7 @@ def test_unsupported_frequencies( stl = STLDecomposer() X_t, y_t = stl.fit_transform(X, y) - if variateness == "univariate": - assert stl.period is not None - else: - assert stl.periods is not None + assert stl.periods is not None @pytest.mark.parametrize( From cee84ead9268ccaae0c87a1dfa978c769e9d07ba Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 25 Aug 2023 14:29:54 -0700 Subject: [PATCH 44/47] set y index --- .../decomposer_tests/test_stl_decomposer.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 1d2cd2aa6b..1e70542d94 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -444,16 +444,11 @@ def test_stl_decomposer_get_trend_dataframe_raises_errors( ) def test_stl_decomposer_get_trend_dataframe_sets_X_index_internally( variateness, - generate_seasonal_data, + ts_data, + ts_multiseries_data, ): - X, y = generate_seasonal_data( - real_or_synthetic="synthetic", - univariate_or_multivariate=variateness, - )( - period=7, - set_time_index=False, - ) - + X, _, y = ts_data() if variateness == "univariate" else ts_multiseries_data() + assert isinstance(y.index, pd.DatetimeIndex) X = X.reset_index() assert not isinstance(X.index, pd.DatetimeIndex) From aa5a95a98b7445f748daa2a2f73c908af852c7ab Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 25 Aug 2023 16:42:10 
-0700 Subject: [PATCH 45/47] simplify get_trend_dataframe --- .../transformers/preprocessing/decomposer.py | 10 +-- .../preprocessing/stl_decomposer.py | 80 +++++++------------ .../decomposer_tests/test_stl_decomposer.py | 39 ++++----- 3 files changed, 51 insertions(+), 78 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index d93ae6353a..5bc0edb6e4 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -359,14 +359,14 @@ def plot_decomposition( if len(y.columns) > 1: results = decomposition_results[id] else: - results = decomposition_results - axs[0].plot(results[0]["signal"], "r") + results = decomposition_results[0] + axs[0].plot(results["signal"], "r") axs[0].set_title("signal") - axs[1].plot(results[0]["trend"], "b") + axs[1].plot(results["trend"], "b") axs[1].set_title("trend") - axs[2].plot(results[0]["seasonality"], "g") + axs[2].plot(results["seasonality"], "g") axs[2].set_title("seasonality") - axs[3].plot(results[0]["residual"], "y") + axs[3].plot(results["residual"], "y") axs[3].set_title("residual") # If multiseries, return a dictionary of tuples diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 80aec5c06d..846be5a82b 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -479,65 +479,47 @@ def get_trend_dataframe(self, X, y): X.index = y.index self._check_oos_past(y) - def _decompose_target(X, y, fh, trend, seasonal, residual): + def _decompose_target(X, y, fh): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) != len(trend.index) or not all( - 
y.index == trend.index, + if isinstance(y, pd.Series): + y = y.to_frame() + if all( + len(y.index) == len(self.trends[id].index) + and all( + y.index == self.trends[id].index, + ) + for id in y.columns ): # TODO: Do a better job cloning. decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, - period=period, + periods=self.periods, ) decomposer.fit(X, y) - trend = decomposer.trends[id] - seasonal = decomposer.seasonals[id] - residual = decomposer.residuals[id] - return pd.DataFrame( - { - "signal": y, - "trend": trend, - "seasonality": seasonal, - "residual": residual, - }, - ) - - if isinstance(y, pd.Series): - y = y.to_frame() - series_results = {} - # Iterate through each series id - for id in y.columns: - result_dfs = [] - if len(y.columns) > 1: - seasonal = self.seasonals[id] - trend = self.trends[id] - residual = self.residuals[id] - period = self.periods[id] + trend = decomposer.trends + seasonal = decomposer.seasonals + residual = decomposer.residuals else: - seasonal = list(self.seasonals.values())[0] - trend = list(self.trends.values())[0] - residual = list(self.residuals.values())[0] - period = list(self.periods.values())[0] - - series_y = y[id] - if isinstance(series_y, pd.Series): - result_dfs.append( - _decompose_target( - X, - series_y, - None, - trend, - seasonal, - residual, - ), + trend = self.trends + seasonal = self.seasonals + residual = self.residuals + result_dict = {} + for id in y.columns: + df = pd.DataFrame( + { + "signal": y[id], + "trend": trend[id], + "seasonality": seasonal[id], + "residual": residual[id], + }, ) + if len(y.columns) == 1: + return [df] + else: + result_dict[id] = df + return result_dict - series_results[id] = result_dfs - - # only return the dictionary if single series - if len(y.columns) <= 1: - return result_dfs - return series_results + return _decompose_target(X, y, None) def get_trend_prediction_intervals(self, y, coverage=None): """Calculate the prediction intervals for the trend data. 
diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 1e70542d94..847e132253 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -342,16 +342,12 @@ def test_stl_decomposer_get_trend_dataframe( elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) assert all( - all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) - for df in result_dfs + get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs ) assert len(result_dfs) == 2 - [ - (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) - for df in result_dfs - ] + [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] elif transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -388,20 +384,13 @@ def test_stl_decomposer_get_trend_dataframe( [get_trend_dataframe_format_correct(x) for x in result_dfs] elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) assert all( - all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) - for df in result_dfs - ) - assert all( - (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) - for df in result_dfs + get_trend_dataframe_format_correct(result_dfs[x]) + for x in result_dfs ) assert len(result_dfs) == 2 - [ - (get_trend_dataframe_format_correct(x) for x in result_dfs[df]) - for df in result_dfs - ] + [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] @pytest.mark.parametrize( @@ -462,11 +451,12 @@ def test_stl_decomposer_get_trend_dataframe_sets_X_index_internally( assert 
all(get_trend_dataframe_format_correct(x) for x in result_dfs) elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) assert all( - all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) - for df in result_dfs + get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs ) + assert len(result_dfs) == 2 + [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] @pytest.mark.parametrize( @@ -500,11 +490,12 @@ def test_stl_decomposer_get_trend_dataframe_sets_y_index_internally( assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) assert all( - all(isinstance(x, pd.DataFrame) for x in result_dfs[df]) - for df in result_dfs + get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs ) + assert len(result_dfs) == 2 + [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] @pytest.mark.parametrize( From b9e9d4518e5f12733c824de993c83ac03ca44a41 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 30 Aug 2023 13:37:39 -0700 Subject: [PATCH 46/47] change type to dict(list(df)) --- .../transformers/preprocessing/decomposer.py | 2 +- .../preprocessing/stl_decomposer.py | 2 +- .../decomposer_tests/test_stl_decomposer.py | 33 ++++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index 5bc0edb6e4..10222b3fc7 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -357,7 
+357,7 @@ def plot_decomposition( fig.set_size_inches(18.5, 14.5) if len(y.columns) > 1: - results = decomposition_results[id] + results = decomposition_results[id][0] else: results = decomposition_results[0] axs[0].plot(results["signal"], "r") diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 846be5a82b..b4bcfdd029 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -516,7 +516,7 @@ def _decompose_target(X, y, fh): if len(y.columns) == 1: return [df] else: - result_dict[id] = df + result_dict[id] = [df] return result_dict return _decompose_target(X, y, None) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index 847e132253..6e17067d59 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -342,12 +342,13 @@ def test_stl_decomposer_get_trend_dataframe( elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs) assert all( - get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs ) assert len(result_dfs) == 2 - [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] + [get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs] elif transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -384,13 +385,19 @@ def test_stl_decomposer_get_trend_dataframe( 
[get_trend_dataframe_format_correct(x) for x in result_dfs] elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) assert all( - get_trend_dataframe_format_correct(result_dfs[x]) + isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs + ) + assert all( + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs ) assert len(result_dfs) == 2 - [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] + [ + get_trend_dataframe_format_correct(result_dfs[x][0]) + for x in result_dfs + ] @pytest.mark.parametrize( @@ -451,12 +458,13 @@ def test_stl_decomposer_get_trend_dataframe_sets_X_index_internally( assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs) assert all( - get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs ) assert len(result_dfs) == 2 - [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] + [get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs] @pytest.mark.parametrize( @@ -490,12 +498,13 @@ def test_stl_decomposer_get_trend_dataframe_sets_y_index_internally( assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) elif variateness == "multivariate": assert isinstance(result_dfs, dict) - assert all(isinstance(result_dfs[x], pd.DataFrame) for x in result_dfs) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs) assert all( - 
get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs ) assert len(result_dfs) == 2 - [get_trend_dataframe_format_correct(result_dfs[x]) for x in result_dfs] + [get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs] @pytest.mark.parametrize( From 3cc6cf3c253a5ab75418549ecf43841a2103e528 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 31 Aug 2023 14:04:28 -0700 Subject: [PATCH 47/47] update notes --- docs/source/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index f44a85c98a..881451efa2 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements + * Extended STLDecomposer to Support Multiseries :pr:`4253` * Fixes * Changes * Documentation Changes @@ -16,7 +17,6 @@ Release Notes * Enhancements * Added support for prediction intervals for VARMAX regressor :pr:`4267` * Integrated multiseries time series into AutoMLSearch :pr:`4270` - * Extended STLDecomposer to Support Multiseries :pr:`4253` * Fixes * Fixed error when stacking data with no exogenous variables :pr:`4275` * Changes