Added support for additional estimators for multiseries datasets #4385

Merged
10 commits merged on Jan 31, 2024
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added support for additional estimators for multiseries datasets :pr:`4385`
* Fixes
* Fixed bug in `_downcast_nullable_y` causing woodwork initialization issues :pr:`4369`
* Fixed multiseries prediction interval labels :pr:`4377`
@@ -47,10 +47,12 @@ class CatBoostRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(
@@ -55,10 +55,12 @@ class DecisionTreeRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(
@@ -33,10 +33,12 @@ class ElasticNetRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(
@@ -56,10 +56,12 @@ class ExtraTreesRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(
@@ -68,7 +68,10 @@ class LightGBMRegressor(Estimator):
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
]
"""[ProblemTypes.REGRESSION]"""
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
]"""

SEED_MIN = 0
SEED_MAX = SEED_BOUNDS.max_bound
@@ -28,10 +28,12 @@ class LinearRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(self, fit_intercept=True, n_jobs=-1, random_seed=0, **kwargs):
@@ -37,10 +37,12 @@ class RandomForestRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(
@@ -40,10 +40,12 @@ class XGBoostRegressor(Estimator):
supported_problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

# xgboost supports seeds from -2**31 to 2**31 - 1 inclusive. these limits ensure the random seed generated below
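Taken together, the estimator changes above mean the multiseries time series regression problem type now resolves to more than just VARMAX. As a hedged sanity check (assuming get_estimators is importable from evalml.pipelines.components.utils and that optional dependencies such as xgboost and catboost are installed):

    from evalml.pipelines.components.utils import get_estimators
    from evalml.problem_types import ProblemTypes

    # Lists every estimator class whose supported_problem_types includes
    # multiseries time series regression; the exact set depends on which
    # optional extras are installed in the environment.
    estimators = get_estimators(ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION)
    print([estimator.name for estimator in estimators])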
@@ -1,4 +1,5 @@
"""Transformer to drop rows specified by row indices."""
import pandas as pd
from woodwork import init_series

from evalml.pipelines.components.transformers import Transformer
@@ -43,12 +44,25 @@ def transform(self, X, y=None):
y_t = infer_feature_types(y) if y is not None else None

X_t_schema = X_t.ww.schema
y_t_logical = None
y_t_semantic = None
if y_t is not None:
y_t_logical = y_t.ww.logical_type
y_t_semantic = y_t.ww.semantic_tags
if isinstance(y_t, pd.DataFrame):
y_t_logical = y_t.ww.logical_types
y_t_semantic = y_t.ww.semantic_tags
else:
y_t_logical = y_t.ww.logical_type
y_t_semantic = y_t.ww.semantic_tags

X_t, y_t = drop_rows_with_nans(X_t, y_t)
X_t.ww.init_with_full_schema(X_t_schema)
if y_t is not None:
y_t = init_series(y_t, logical_type=y_t_logical, semantic_tags=y_t_semantic)
if isinstance(y_t, pd.DataFrame):
y_t.ww.init(logical_types=y_t_logical, semantic_tags=y_t_semantic)
else:
y_t = init_series(
y_t,
logical_type=y_t_logical,
semantic_tags=y_t_semantic,
)
return X_t, y_t
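As an aside, a minimal standalone sketch of the type round trip above, showing why a DataFrame target needs the plural woodwork accessors and ww.init while a Series target uses logical_type and init_series (column names and values are made up, and dropna stands in for the transformer's NaN-dropping helper):

    import pandas as pd
    from woodwork import init_series

    # Multiseries target: one column per series, with NaN rows to drop
    y = pd.DataFrame({"target|a": [1.0, 2.0, None], "target|b": [3.0, None, 5.0]})
    y.ww.init()
    logical_types = y.ww.logical_types   # per-column logical types (a dict)
    semantic_tags = y.ww.semantic_tags   # per-column semantic tags (a dict)
    y_t = y.dropna()
    y_t.ww.init(logical_types=logical_types, semantic_tags=semantic_tags)

    # Single-series target: singular accessors plus init_series
    y_series = pd.Series([1.0, None, 3.0], name="target")
    y_series.ww.init()
    y_series_t = init_series(
        y_series.dropna(),
        logical_type=y_series.ww.logical_type,
        semantic_tags=y_series.ww.semantic_tags,
    )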
@@ -129,7 +129,16 @@ def fit(self, X, y=None):

# For the multiseries case, where we only want the start delay lag for the baseline
if isinstance(y, pd.DataFrame):
self.statistically_significant_lags = [self.start_delay]
self.statistically_significant_lags = {}
for column in y.columns:
self.statistically_significant_lags[
column
] = self._find_significant_lags(
y[column],
conf_level=self.conf_level,
start_delay=self.start_delay,
max_delay=self.max_delay,
)
Comment on lines +141 to +150
Contributor:
We can make this section more concise and easier to maintain by folding the single series case into the multiseries case: convert the single series to a dataframe and keep this code for both cases, following the pattern in other files that already support multiseries (the STL decomposer might be a good example?).

Contributor Author (@christopherbunn, Jan 30, 2024):

Good point, I just consolidated it.

Actually, I just remembered that it's structured this way so that we're still able to run self._find_significant_lags even when y is None. Is there a way you had in mind to structure it so that y can still be None?

Contributor:

Hm, seems like y being None is something we'd want to have explicit behavior for, since right now the behavior is unclear. I think we should just handle it entirely separately

Contributor Author:

We handle y being None in self._find_significant_lags, since we calculate all lags in that function (and just set the significant lags to all_lags if y is None). Should I split it off into its own separate branch, even though the code would be identical to the case where y is a series? e.g.

        # For the multiseries case, each series ID has individualized lag values
        if isinstance(y, pd.DataFrame):
            self.statistically_significant_lags = {}
            for column in y.columns:
                self.statistically_significant_lags[
                    column
                ] = self._find_significant_lags(
                    y[column],
                    conf_level=self.conf_level,
                    start_delay=self.start_delay,
                    max_delay=self.max_delay,
                )
        elif y is None:
            self.statistically_significant_lags = self._find_significant_lags(
                y,
                conf_level=self.conf_level,
                start_delay=self.start_delay,
                max_delay=self.max_delay,
            )
        else:
            self.statistically_significant_lags = self._find_significant_lags(
                y,
                conf_level=self.conf_level,
                start_delay=self.start_delay,
                max_delay=self.max_delay,
            )

Contributor:

Ok, sorry for drilling into this so much, but I think I understand now. My new potentially hot take proposal is something like:

if y is None:
    self.statistically_significant_lags = np.arange(self.start_delay, self.start_delay + self.max_delay + 1)
else:
    if isinstance(y, pd.Series): 
        y = y.to_frame()
    for column in y.columns:
        self.statistically_significant_lags = ...

And then we can remove the handling of y being None from the static function. My argument for doing this is that calling all lags the statistically significant lags is a misnomer, since we didn't actually check statistical significance. This is me getting very into the weeds though, so I very much understand if you would rather keep things closer to the way they are 😅

Regardless, even with your new proposal, we'd still be able to combine the two non y=None cases by casting the series to a dataframe

Contributor Author (@christopherbunn, Jan 31, 2024):

Your example makes sense to me. I don't see our behavior for y is None changing anytime soon, so I'm comfortable with pulling that out and changing the function. Will update!

else:
self.statistically_significant_lags = self._find_significant_lags(
y,
@@ -234,7 +243,25 @@ def _delay_df(
col = data[col_name]
if categorical_columns and col_name in categorical_columns:
col = X_categorical[col_name]
for t in self.statistically_significant_lags:
# Lags are stored in a dict for multiseries problems
# Returns the lags corresponding to the series ID value
if isinstance(self.statistically_significant_lags, dict):
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

col_series_id = (
MULTISERIES_SEPARATOR_SYMBOL
+ col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1]
)
for (
series_id_target_name,
lag_list,
) in self.statistically_significant_lags.items():
if series_id_target_name.endswith(col_series_id):
lags = lag_list
break
else:
lags = self.statistically_significant_lags
for t in lags:
lagged_features[self.df_colname_prefix.format(col_name, t)] = col.shift(
t,
)
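To make the per-series lag lookup above concrete, here is a small self-contained sketch; the separator character and lag values are illustrative stand-ins for MULTISERIES_SEPARATOR_SYMBOL and the fitted lags:

    SEPARATOR = "|"  # stand-in for evalml's MULTISERIES_SEPARATOR_SYMBOL

    # For multiseries problems, fitted lags are stored per unstacked target column
    significant_lags = {"target|store_1": [1, 2], "target|store_2": [1, 3]}

    def lags_for_column(col_name):
        # The series ID is the last separator-delimited token of the delayed column name
        col_series_id = SEPARATOR + col_name.split(SEPARATOR)[-1]
        for series_id_target_name, lag_list in significant_lags.items():
            if series_id_target_name.endswith(col_series_id):
                return lag_list
        return []

    print(lags_for_column("feature_a|store_2"))  # [1, 3]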
41 changes: 35 additions & 6 deletions evalml/pipelines/multiseries_regression_pipeline.py
@@ -83,6 +83,7 @@ def _fit(self, X, y):

self.component_graph.fit(X_unstacked, y_unstacked)
self.input_feature_names = self.component_graph.input_feature_names
self.series_id_target_names = y_unstacked.columns

def predict_in_sample(
self,
@@ -144,7 +145,7 @@ def predict_in_sample(
]
y_overlapping_features = [
feature
for feature in y_train_unstacked.columns
for feature in self.series_id_target_names
if feature in y_unstacked.columns
]
y_unstacked = y_unstacked[y_overlapping_features]
@@ -154,7 +155,6 @@ def predict_in_sample(
y_train_unstacked = infer_feature_types(y_train_unstacked)
X_unstacked = infer_feature_types(X_unstacked)
y_unstacked = infer_feature_types(y_unstacked)

unstacked_predictions = super().predict_in_sample(
X_unstacked,
y_unstacked,
@@ -163,16 +163,45 @@
objective,
calculating_residuals,
)
unstacked_predictions = unstacked_predictions[
[
series_id_target
for series_id_target in y_train_unstacked.columns
if series_id_target in unstacked_predictions.columns
]
]
unstacked_predictions.index = X_unstacked[self.time_index]
stacked_predictions = stack_data(
unstacked_predictions,
include_series_id=include_series_id,
include_series_id=True,
series_id_name=self.series_id,
)

stacked_predictions = stacked_predictions.reset_index()
Contributor:

What's the reasoning behind setting the index and then immediately resetting the index? The value of the index shouldn't impact the order of stacking, right?

Either way, we can explicitly control the index in stack_data with the starting_index argument

Contributor Author:

The goal of this snippet is to set the index to the time index column, stack the data (thus using the dates in the time index column to generate the new stacked dates), and then reset the index so that the resulting time index column can be used when we pd.merge later on in line 193.

While it's possible to just copy over the time_index column from X after stacking, I think it's safer to generate it from the X_unstacked index like this, since we know for sure that the X_unstacked time_index aligns with the unstacked_predictions, whereas it's technically possible for X to have a time_index that's out of order (and copying that column over directly would then be incorrect). I'm open to suggestions for a cleaner implementation!

Contributor:

Ok, I think I understand now! I think a comment would be great. I also wonder if it would be useful to explicitly say reset_index(drop=False), so that even if pandas changes their defaults we don't get screwed.

My motivation here is that this is something that might be confusing to someone looking back at it in the future, since the goal isn't clear from the code itself. I hope that makes sense!

Contributor Author:

Good call on the reset index parameter, I'll add that in. I'll add a clarifying comment or two so that it's clear what's going on here.

Your motivation makes sense! I feel like I've been so lost in the weeds of this implementation for a while now so it's good to have multiple pairs of eyes on this to highlight what's intuitive and what isn't 😅

sp_dtypes = {
self.time_index: X[self.time_index].dtype,
self.series_id: X[self.series_id].dtype,
self.input_target_name: y.dtype,
}
stacked_predictions = stacked_predictions.astype(sp_dtypes)

# Order prediction based on input (date, series_id)
output_cols = (
[self.series_id, self.input_target_name]
if include_series_id
else [self.input_target_name]
)
stacked_predictions = pd.merge(
X,
stacked_predictions,
on=[self.time_index, self.series_id],
)[output_cols]
# Index will start at the unstacked index, so we need to reset it to the original index
stacked_predictions.index = X.index
stacked_predictions = infer_feature_types(stacked_predictions)
return stacked_predictions

if not include_series_id:
return infer_feature_types(stacked_predictions[self.input_target_name])
else:
return infer_feature_types(stacked_predictions)

def get_forecast_period(self, X):
"""Generates all possible forecasting time points based on latest data point in X.
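As a toy illustration of the merge-based reordering above (the frame contents, column names, and values are made up), merging on the time index and series ID realigns the stacked predictions with the caller's original row order:

    import pandas as pd

    X = pd.DataFrame({
        "date": pd.to_datetime(["2020-01-02", "2020-01-01", "2020-01-02", "2020-01-01"]),
        "series": ["a", "a", "b", "b"],
    })
    # Predictions come back ordered by (date, series) after the unstack/predict/stack round trip
    stacked_predictions = pd.DataFrame({
        "date": pd.to_datetime(["2020-01-01", "2020-01-01", "2020-01-02", "2020-01-02"]),
        "series": ["a", "b", "a", "b"],
        "target": [1.0, 2.0, 3.0, 4.0],
    })
    # An inner merge preserves the left frame's key order, so predictions line up with X's rows
    ordered = pd.merge(X, stacked_predictions, on=["date", "series"])[["series", "target"]]
    ordered.index = X.index  # restore the original index
    print(ordered)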
25 changes: 13 additions & 12 deletions evalml/pipelines/utils.py
@@ -132,6 +132,7 @@ def _get_datetime(X, y, problem_type, estimator_class, sampler_name=None):
if add_datetime_featurizer and estimator_class.model_family not in [
ModelFamily.ARIMA,
ModelFamily.PROPHET,
ModelFamily.VARMAX,
]:
components.append(DateTimeFeaturizer)
return components
@@ -298,13 +299,7 @@ def _get_preprocessing_components(
Returns:
list[Transformer]: A list of applicable preprocessing components to use with the estimator.
"""
if is_multiseries(problem_type):
if include_decomposer:
components_functions = [_get_decomposer]
else:
return []

elif is_time_series(problem_type):
if is_time_series(problem_type):
components_functions = [
_get_label_encoder,
_get_drop_all_null,
@@ -1508,22 +1503,28 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
time_index (str): The name of the time index column.
starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index
will match that of the input data. Defaults to None.
series_id_values (set, list): The unique values of a series ID, used to generate the index. If None, values will
series_id_values (list): The unique values of a series ID, used to generate the index. If None, values will
be generated from X column values. Required if X only has time index values and no exogenous values.
Defaults to None.

Returns:
pd.DataFrame: The restacked features.
"""
original_columns = set()
series_ids = series_id_values or set()
if series_id_values is None:
if series_id_values is not None:
series_ids = series_id_values
else:
# Using list to maintain order (vs. a set)
series_ids = list()
for col in X.columns:
if col == time_index:
continue
separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
series_ids.add(separated_name[-1])
series_ids.append(separated_name[-1])
# Remove duplicates
seen = set()
series_ids = [val for val in series_ids if not (val in seen or seen.add(val))]

if len(series_ids) == 0:
raise ValueError(
@@ -1542,7 +1543,7 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
restacked_X = pd.DataFrame(
{
time_index: time_index_col,
series_id_name: sorted(list(series_ids)) * len(X),
series_id_name: list(series_ids) * len(X),
},
index=stacked_index,
)
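The order-preserving de-duplication above relies on set.add returning None inside the or expression; a tiny self-contained example of the idiom:

    values = ["b", "a", "b", "c", "a"]
    seen = set()
    # Keep the first occurrence of each value while preserving encounter order
    deduped = [v for v in values if not (v in seen or seen.add(v))]
    print(deduped)  # ['b', 'a', 'c']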
2 changes: 1 addition & 1 deletion evalml/preprocessing/utils.py
@@ -79,7 +79,7 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs):

# Get unique series value from X if there is only the time_index column
# Otherwise, this information is generated in `stack_X` from the column values
series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None
series_id_values = X[series_id].unique() if len(X_unstacked.columns) == 1 else None

X_train = stack_X(
X_train_unstacked,
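For context on the change above: pandas' unique() returns values in order of first appearance, whereas set() discards order, which matters when the IDs are later paired with stacked data. A quick illustration with made-up series IDs:

    import pandas as pd

    ids = pd.Series(["store_2", "store_1", "store_2", "store_1"])
    print(list(ids.unique()))  # ['store_2', 'store_1'] -- first-appearance order preserved
    print(sorted(set(ids)))    # ['store_1', 'store_2'] -- a set needs an explicit sort for a stable order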
6 changes: 3 additions & 3 deletions evalml/tests/automl_tests/test_default_algorithm.py
@@ -670,7 +670,7 @@
)

first_batch = algo.next_batch()
assert len(first_batch) == 2
assert len(first_batch) == 8

[Codecov / codecov/patch: added line #L673 in evalml/tests/automl_tests/test_default_algorithm.py was not covered by tests]
pipeline = first_batch[0]
assert pipeline.model_family == ModelFamily.VARMAX
assert pipeline.parameters["pipeline"] == search_parameters["pipeline"]
Expand All @@ -679,8 +679,8 @@

long_explore = algo.next_batch()
long_estimators = set([pipeline.estimator.name for pipeline in long_explore])
assert len(long_explore) == 100
assert len(long_estimators) == 1
assert len(long_explore) == 300
assert len(long_estimators) == 3

[Codecov / codecov/patch: added lines #L682-L683 in evalml/tests/automl_tests/test_default_algorithm.py were not covered by tests]


@pytest.mark.parametrize(
8 changes: 4 additions & 4 deletions evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -18,7 +18,6 @@
DateTimeFeaturizer,
EmailFeaturizer,
NaturalLanguageFeaturizer,
STLDecomposer,
TimeSeriesFeaturizer,
URLFeaturizer,
)
@@ -98,19 +97,20 @@
assert algo.batch_number == 0
assert algo.default_max_batches == 1
estimators = get_estimators(problem_type)
decomposer = [STLDecomposer] if is_regression(problem_type) else []
decomposer = [True, False] if is_regression(problem_type) else [True]

[Codecov / codecov/patch: added line #L100 in evalml/tests/automl_tests/test_iterative_algorithm.py was not covered by tests]
Collaborator:

Can you add a comment clarifying why you're using True/False instead of the decomposer name?

Contributor:

I think it would be better to just parametrize the include_decomposer argument here - this and the below section are confusing to read out of context

Contributor Author:

For this test, we are basically only checking that the number of pipelines matches up. Before, we only needed to add the decomposer pipeline once, since there was one estimator type (VARMAX).

Now, since we have multiple estimator types, each estimator type will have one pipeline with a decomposer and another without a decomposer. As such, we need this [True, False] list and to iterate through it in order to generate the correct number of pipelines.

Collaborator:

I think a clarifying comment would be useful here 👍

Contributor Author:

Added a short one to this test

assert len(algo.allowed_pipelines) == len(
[
make_pipeline(
X,
y,
estimator,
problem_type,
include_decomposer=include_decomposer,
parameters=search_parameters,
)
for estimator in estimators
]
+ decomposer,
for include_decomposer in decomposer
],
)


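Put differently, for multiseries regression the expected pipeline count in this test becomes len(estimators) * 2 (one pipeline with and one without a decomposer per estimator) rather than len(estimators) + 1. A hypothetical sanity check of that counting logic (estimator names are illustrative):

    # Illustrative only; mirrors the nested comprehension in the test above
    estimators = ["VARMAX Regressor", "Random Forest Regressor", "Extra Trees Regressor"]
    decomposer_options = [True, False]  # regression problems try pipelines with and without a decomposer
    expected_pipelines = len(estimators) * len(decomposer_options)
    print(expected_pipelines)  # 6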
@@ -14,6 +14,7 @@ def test_problem_types():
assert set(DecisionTreeRegressor.supported_problem_types) == {
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
}

