From 4f407d34830537b684fb9068814d012580e8893f Mon Sep 17 00:00:00 2001
From: Yoni Shelach <92271540+yonishelach@users.noreply.github.com>
Date: Sun, 22 Jan 2023 11:30:03 +0200
Subject: [PATCH] [Feature Store] Fix `Imputer`'s `None` types check (#2941)

---
 mlrun/feature_store/steps.py      |  6 +++---
 tests/feature-store/test_steps.py | 36 +++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/mlrun/feature_store/steps.py b/mlrun/feature_store/steps.py
index 43459263a2..3460a7704d 100644
--- a/mlrun/feature_store/steps.py
+++ b/mlrun/feature_store/steps.py
@@ -272,12 +272,12 @@ def __init__(
         :param kwargs: optional kwargs (for storey)
         """
         super().__init__(**kwargs)
-        self.mapping = mapping
+        self.mapping = mapping or {}
         self.method = method
         self.default_value = default_value
 
-    def _impute(self, feature: str, value):
-        if value is None:
+    def _impute(self, feature: str, value: Any):
+        if pd.isna(value):
             return self.mapping.get(feature, self.default_value)
         return value
 
diff --git a/tests/feature-store/test_steps.py b/tests/feature-store/test_steps.py
index 1590b2ec61..0d92511c6f 100644
--- a/tests/feature-store/test_steps.py
+++ b/tests/feature-store/test_steps.py
@@ -16,6 +16,7 @@
 import time
 import unittest.mock
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -616,6 +617,41 @@ def test_pandas_step_drop_feature(rundb_mock, entities, set_index_before):
     )
 
 
+@pytest.mark.parametrize("engine", ["storey", "pandas"])
+def test_imputer_default_value(rundb_mock, engine):
+    data_with_nones = pd.DataFrame(
+        {
+            "id": [1, 2, 3, 4],
+            "height": [None, 160, pd.NA, np.nan],
+            "age": [20, pd.NaT, 19, 18],
+        }
+    )
+    # Building graph with Imputer:
+    feature_set = fstore.FeatureSet(
+        "fs-default-value",
+        entities=["id"],
+        description="feature set with nones",
+        engine=engine,
+    )
+    feature_set.graph.to(Imputer(default_value=1))
+
+    # Mocking
+    output_path = tempfile.TemporaryDirectory()
+    feature_set._run_db = rundb_mock
+    feature_set.reload = unittest.mock.Mock()
+    feature_set.save = unittest.mock.Mock()
+    feature_set.purge_targets = unittest.mock.Mock()
+
+    imputed_df = fstore.ingest(
+        featureset=feature_set,
+        source=data_with_nones,
+        targets=[ParquetTarget(path=f"{output_path.name}/temp.parquet")],
+    )
+
+    # Checking that the ingested dataframe is none-free:
+    assert not imputed_df.isnull().values.any()
+
+
 def get_data(with_none=False):
     names = ["A", "B", "C", "D", "E"]
     ages = [33, 4, 76, 90, 24]