From aab161db1f7caf0eb6fe619e5e0326637d6e936f Mon Sep 17 00:00:00 2001 From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com> Date: Mon, 2 Dec 2024 17:20:27 +0100 Subject: [PATCH 1/5] WIP --- feature_engine/encoding/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/feature_engine/encoding/__init__.py b/feature_engine/encoding/__init__.py index 3c689b7ad..4044e6df5 100644 --- a/feature_engine/encoding/__init__.py +++ b/feature_engine/encoding/__init__.py @@ -10,6 +10,7 @@ from .rare_label import RareLabelEncoder from .similarity_encoder import StringSimilarityEncoder from .woe import WoEEncoder +from .pandas_categorical import PandasCategoricalEncoder __all__ = [ "CountFrequencyEncoder", @@ -20,4 +21,5 @@ "RareLabelEncoder", "StringSimilarityEncoder", "WoEEncoder", + "PandasCategoricalEncoder", ] From a53b8dea62a5bc7db197c885d0bea009d7b257fc Mon Sep 17 00:00:00 2001 From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com> Date: Mon, 2 Dec 2024 17:20:36 +0100 Subject: [PATCH 2/5] WIP --- feature_engine/encoding/pandas_categorical.py | 195 ++++++++++++++++++ .../test_pandas_categorical_encoder.py | 96 +++++++++ 2 files changed, 291 insertions(+) create mode 100644 feature_engine/encoding/pandas_categorical.py create mode 100644 tests/test_encoding/test_pandas_categorical_encoder.py diff --git a/feature_engine/encoding/pandas_categorical.py b/feature_engine/encoding/pandas_categorical.py new file mode 100644 index 000000000..2097e9885 --- /dev/null +++ b/feature_engine/encoding/pandas_categorical.py @@ -0,0 +1,195 @@ +from typing import List, Optional, Union + +import pandas as pd + +from feature_engine._docstrings.fit_attributes import ( + _feature_names_in_docstring, + _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.init_parameters.all_trasnformers import ( + _missing_values_docstring, + _variables_categorical_docstring, +) +from feature_engine._docstrings.init_parameters.encoders import ( + _ignore_format_docstring, + _unseen_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_transform_docstring, + _inverse_transform_docstring, + _transform_encoders_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.dataframe_checks import check_X +from feature_engine.encoding._helper_functions import check_parameter_unseen +from feature_engine.encoding.base_encoder import ( + CategoricalInitMixinNA, + CategoricalMethodsMixin, +) +from feature_engine.dataframe_checks import ( + _check_optional_contains_na,) + + +@Substitution( + missing_values=_missing_values_docstring, + ignore_format=_ignore_format_docstring, + variables=_variables_categorical_docstring, + unseen=_unseen_docstring, + variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit_transform=_fit_transform_docstring, + transform=_transform_encoders_docstring, + inverse_transform=_inverse_transform_docstring, +) +class PandasCategoricalEncoder(CategoricalInitMixinNA, CategoricalMethodsMixin): + """Transform columns into pandas categorical type columns. + + Simply applying pandas.to_categorical() separately on train and test set + will not guarantee that each category are encoded in the same way in both datasets. + + This class addresses this problem by making sure that categories are encoded + consistently between train and test set. + + When `unseen="ignore"` unseen categories encountered during transform are + transformed to NAN when the unseen parameter and will have an associated encoded + value of -1. + + Parameters + ---------- + + {variables} + + {missing_values} + + {ignore_format} + + {unseen} + + Attributes + ---------- + encoder_dict_: + Dictionary with the ordinal number per category, per variable. + + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + fit: + Find the integer to replace each category in each variable. + + {fit_transform} + + {inverse_transform} + + {transform} + + Notes + ----- + NAN are introduced when encoding categories that were not present in the training + dataset. If this happens, try grouping infrequent categories using the + RareLabelEncoder(). + + See Also + -------- + feature_engine.encoding.RareLabelEncoder + category_encoders.ordinal.OrdinalEncoder + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.encoding import PandasCategoricalEncoder + >>> X = pd.DataFrame(dict(x1 = [1,2,3,4], x2 = ["c", "a", "b", "c"])) + >>> y = pd.Series([0,1,1,0]) + >>> pandas_cat_encoder = PandasCategoricalEncoder() + >>> pandas_cat_encoder.fit(X) + >>> X_transformed = pandas_cat_encoder.transform(X) + >>> X_transformed + x1 x2 + 0 1 c + 1 2 a + 2 3 b + 3 4 c + >>> X_transformed.dtypes + x1 int64 + x2 category + dtype: object + """ + + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + missing_values: str = "raise", + ignore_format: bool = False, + unseen: str = "ignore", + ) -> None: + + check_parameter_unseen(unseen, ["ignore", "raise"]) + super().__init__(variables, missing_values, ignore_format) + self.unseen = unseen + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """Learn the numbers to be used to replace the categories in each + variable. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. Can be the entire dataframe, not just the + variables to be encoded. + + y: pandas series, default=None + The Target. Can be None if `encoding_method='arbitrary'`. + Otherwise, y needs to be passed when fitting the transformer. + """ + + X = check_X(X) + + variables_ = self._check_or_select_variables(X) + self._check_na(X, variables_) + + self.encoder_dict_ = {} + for feature in variables_: + self.encoder_dict_[feature] = sorted( + [val for val in X[feature].unique() if pd.notnull(val)] + ) + + if self.unseen == "encode": + self._unseen = -1 + + # assign underscore parameters at the end in case code above fails + self.variables_ = variables_ + self._get_feature_names_in(X) + return self + + def transform(self, X): + """ + Transforms the specified columns in the DataFrame to categorical dtype. + + Args: + X (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame with specified columns converted to categorical + dtype. + """ + X = self._check_transform_input_and_state(X) + # check if dataset contains na + if self.missing_values == "raise": + _check_optional_contains_na(X, self.variables_) + + for feature in self.encoder_dict_.keys(): + X[feature] = pd.Categorical( + X[feature], categories=self.encoder_dict_[feature] + ) + + if self.unseen == "raise": + self._check_nan_values_after_transformation(X) + + return X diff --git a/tests/test_encoding/test_pandas_categorical_encoder.py b/tests/test_encoding/test_pandas_categorical_encoder.py new file mode 100644 index 000000000..ba4930ac2 --- /dev/null +++ b/tests/test_encoding/test_pandas_categorical_encoder.py @@ -0,0 +1,96 @@ +import pandas as pd + +from feature_engine.encoding import PandasCategoricalEncoder + +def test_fit_with_specified_variables(): + """ + Test fitting the transformer with specified variables. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A"]) + transformer.fit(df) + + assert transformer.variables == ["A"] + assert transformer.encoder_dict_ == {"A": ["a", "b", "c"]} + + +def test_fit_with_all_object_variables(): + """ + Test fitting the transformer with all object variables. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder() + transformer.fit(df) + + assert transformer.variables == ["A", "B"] + assert transformer.encoder_dict_ == {"A": ["a", "b", "c"], "B": ["x", "y", "z"]} + + +def test_transform(): + """ + Test transforming the dataframe with the fitted transformer. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformer.fit(df) + transformed_df = transformer.transform(df) + + assert transformed_df["A"].dtype.name == "category" + assert transformed_df["B"].dtype.name == "category" + assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] + assert list(transformed_df["B"].cat.categories) == ["x", "y", "z"] + + +def test_transform_with_unseen_data(): + """ + Test transforming the dataframe with unseen data. + """ + df_train = pd.DataFrame({"A": ["a", "c", "b", "a"], "B": ["x", "y", "x", "z"]}) + df_test = pd.DataFrame({"A": ["a", "b", "c", "d"], "B": ["x", "y", "z", "w"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformed_train_df = transformer.fit_transform(df_train) + transformed_test_df = transformer.transform(df_test) + + assert transformed_test_df["A"].dtype.name == "category" + assert transformed_test_df["B"].dtype.name == "category" + assert list(transformed_test_df["A"].cat.categories) == ["a", "b", "c"] + assert list(transformed_test_df["B"].cat.categories) == ["x", "y", "z"] + assert transformed_test_df["A"].isnull().tolist() == [False, False, False, True] + assert transformed_test_df["B"].isnull().tolist() == [False, False, False, True] + + # Check that the category codes are consistent between the training and test sets + # Expected codes: a=0, b=1, c=2, d=-1 + assert transformed_train_df["A"].cat.codes.tolist() == [0, 2, 1, 0] + assert transformed_test_df["A"].cat.codes.tolist() == [0, 1, 2, -1] + + +def test_transform_with_missing_values(): + """ + Test transforming the dataframe with missing values. + """ + df = pd.DataFrame({"A": ["a", "b", None, "c"], "B": ["x", None, "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformer.fit(df) + transformed_df = transformer.transform(df) + + assert transformed_df["A"].dtype.name == "category" + assert transformed_df["B"].dtype.name == "category" + assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] + assert list(transformed_df["B"].cat.categories) == ["x", "z"] + assert transformed_df["A"].isnull().sum() == 1 + assert transformed_df["B"].isnull().sum() == 1 + + +def test_fit_transform(): + """ + Test the fit_transform method. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformed_df = transformer.fit_transform(df) + + assert transformed_df["A"].dtype.name == "category" + assert transformed_df["B"].dtype.name == "category" + assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] + assert list(transformed_df["B"].cat.categories) == ["x", "y", "z"] + From 5b94c548954d5e48dce78714f73785d7ca3a153c Mon Sep 17 00:00:00 2001 From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:08:43 +0100 Subject: [PATCH 3/5] Refactor PandasCategoricalEncoder to include inverse_transform method --- feature_engine/encoding/pandas_categorical.py | 43 +++++++++++++--- .../test_pandas_categorical_encoder.py | 51 +++++++++++++++---- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/feature_engine/encoding/pandas_categorical.py b/feature_engine/encoding/pandas_categorical.py index 2097e9885..7b4ff4ded 100644 --- a/feature_engine/encoding/pandas_categorical.py +++ b/feature_engine/encoding/pandas_categorical.py @@ -28,7 +28,8 @@ CategoricalMethodsMixin, ) from feature_engine.dataframe_checks import ( - _check_optional_contains_na,) + _check_optional_contains_na, +) @Substitution( @@ -156,9 +157,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.encoder_dict_ = {} for feature in variables_: - self.encoder_dict_[feature] = sorted( - [val for val in X[feature].unique() if pd.notnull(val)] - ) + self.encoder_dict_[feature] = { + category: index + for index, category in enumerate( + sorted([val for val in X[feature].unique() if pd.notnull(val)]) + ) + } if self.unseen == "encode": self._unseen = -1 @@ -184,12 +188,39 @@ def transform(self, X): if self.missing_values == "raise": _check_optional_contains_na(X, self.variables_) - for feature in self.encoder_dict_.keys(): + for feature in self.variables: X[feature] = pd.Categorical( - X[feature], categories=self.encoder_dict_[feature] + X[feature], + # categories are sorted to ensure consistency between train and test set + categories=sorted( + self.encoder_dict_[feature], key=self.encoder_dict_[feature].get + ), ) if self.unseen == "raise": self._check_nan_values_after_transformation(X) return X + + def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: + """Convert the encoded variable back to the original values. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features]. + The transformed dataframe. + + Returns + ------- + X_tr: pandas dataframe of shape = [n_samples, n_features]. + The un-transformed dataframe, with the categorical variables containing the + original values. + """ + X = self._check_transform_input_and_state(X) + + # replace encoded categories by the original values + for feature in self.encoder_dict_.keys(): + inv_map = {v: k for k, v in self.encoder_dict_[feature].items()} + X[feature] = X[feature].cat.codes.map(inv_map) + + return X diff --git a/tests/test_encoding/test_pandas_categorical_encoder.py b/tests/test_encoding/test_pandas_categorical_encoder.py index ba4930ac2..9ee2d0160 100644 --- a/tests/test_encoding/test_pandas_categorical_encoder.py +++ b/tests/test_encoding/test_pandas_categorical_encoder.py @@ -2,6 +2,7 @@ from feature_engine.encoding import PandasCategoricalEncoder + def test_fit_with_specified_variables(): """ Test fitting the transformer with specified variables. @@ -11,7 +12,7 @@ def test_fit_with_specified_variables(): transformer.fit(df) assert transformer.variables == ["A"] - assert transformer.encoder_dict_ == {"A": ["a", "b", "c"]} + assert transformer.encoder_dict_ == {"A": {"a": 0, "b": 1, "c": 2}} def test_fit_with_all_object_variables(): @@ -22,15 +23,19 @@ def test_fit_with_all_object_variables(): transformer = PandasCategoricalEncoder() transformer.fit(df) - assert transformer.variables == ["A", "B"] - assert transformer.encoder_dict_ == {"A": ["a", "b", "c"], "B": ["x", "y", "z"]} + assert transformer.variables_ == ["A", "B"] + assert transformer.encoder_dict_ == { + "A": {"a": 0, "b": 1, "c": 2}, + "B": {"x": 0, "y": 1, "z": 2}, + } -def test_transform(): +def test_transform_alphabetically_unordered_category(): """ - Test transforming the dataframe with the fitted transformer. + Test transforming a dataframe with a category that is not alphabetically ordered + (c). """ - df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + df = pd.DataFrame({"A": ["c", "a", "b", "a"], "B": ["x", "y", "x", "z"]}) transformer = PandasCategoricalEncoder(variables=["A", "B"]) transformer.fit(df) transformed_df = transformer.transform(df) @@ -39,6 +44,13 @@ def test_transform(): assert transformed_df["B"].dtype.name == "category" assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] assert list(transformed_df["B"].cat.categories) == ["x", "y", "z"] + assert transformed_df["A"].cat.codes.tolist() == [2, 0, 1, 0] + assert transformed_df["B"].cat.codes.tolist() == [0, 1, 0, 2] + assert transformer.variables_ == ["A", "B"] + assert transformer.encoder_dict_ == { + "A": {"a": 0, "b": 1, "c": 2}, + "B": {"x": 0, "y": 1, "z": 2}, + } def test_transform_with_unseen_data(): @@ -46,7 +58,7 @@ def test_transform_with_unseen_data(): Test transforming the dataframe with unseen data. """ df_train = pd.DataFrame({"A": ["a", "c", "b", "a"], "B": ["x", "y", "x", "z"]}) - df_test = pd.DataFrame({"A": ["a", "b", "c", "d"], "B": ["x", "y", "z", "w"]}) + df_test = pd.DataFrame({"A": ["a", "b", "c", "unseen"], "B": ["x", "y", "z", "w"]}) transformer = PandasCategoricalEncoder(variables=["A", "B"]) transformed_train_df = transformer.fit_transform(df_train) transformed_test_df = transformer.transform(df_test) @@ -59,7 +71,7 @@ def test_transform_with_unseen_data(): assert transformed_test_df["B"].isnull().tolist() == [False, False, False, True] # Check that the category codes are consistent between the training and test sets - # Expected codes: a=0, b=1, c=2, d=-1 + # Expected codes: a=0, b=1, c=2, unseen=-1 assert transformed_train_df["A"].cat.codes.tolist() == [0, 2, 1, 0] assert transformed_test_df["A"].cat.codes.tolist() == [0, 1, 2, -1] @@ -69,7 +81,9 @@ def test_transform_with_missing_values(): Test transforming the dataframe with missing values. """ df = pd.DataFrame({"A": ["a", "b", None, "c"], "B": ["x", None, "x", "z"]}) - transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformer = PandasCategoricalEncoder( + variables=["A", "B"], missing_values="ignore" + ) transformer.fit(df) transformed_df = transformer.transform(df) @@ -79,6 +93,11 @@ def test_transform_with_missing_values(): assert list(transformed_df["B"].cat.categories) == ["x", "z"] assert transformed_df["A"].isnull().sum() == 1 assert transformed_df["B"].isnull().sum() == 1 + assert transformer.variables_ == ["A", "B"] + assert transformer.encoder_dict_ == { + "A": {"a": 0, "b": 1, "c": 2}, + "B": {"x": 0, "z": 1}, + } def test_fit_transform(): @@ -93,4 +112,18 @@ def test_fit_transform(): assert transformed_df["B"].dtype.name == "category" assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] assert list(transformed_df["B"].cat.categories) == ["x", "y", "z"] + assert transformed_df["A"].cat.codes.tolist() == [0, 1, 0, 2] + assert transformed_df["B"].cat.codes.tolist() == [0, 1, 0, 2] + assert transformer.variables_ == ["A", "B"] + assert transformer.encoder_dict_ == { + "A": {"a": 0, "b": 1, "c": 2}, + "B": {"x": 0, "y": 1, "z": 2}, + } + +def test_inverse_transform(): + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformed_df = transformer.fit_transform(df) + inverse_df = transformer.inverse_transform(transformed_df) + pd.testing.assert_frame_equal(df, inverse_df) \ No newline at end of file From 1bf88e6166fc357a60aa84a927b2fe9261e366e9 Mon Sep 17 00:00:00 2001 From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:49:01 +0100 Subject: [PATCH 4/5] Add more tests --- feature_engine/encoding/pandas_categorical.py | 5 +- .../test_pandas_categorical_encoder.py | 156 +++++++++++++++++- 2 files changed, 154 insertions(+), 7 deletions(-) diff --git a/feature_engine/encoding/pandas_categorical.py b/feature_engine/encoding/pandas_categorical.py index 7b4ff4ded..d9362bb18 100644 --- a/feature_engine/encoding/pandas_categorical.py +++ b/feature_engine/encoding/pandas_categorical.py @@ -188,7 +188,7 @@ def transform(self, X): if self.missing_values == "raise": _check_optional_contains_na(X, self.variables_) - for feature in self.variables: + for feature in self.variables_: X[feature] = pd.Categorical( X[feature], # categories are sorted to ensure consistency between train and test set @@ -197,8 +197,7 @@ def transform(self, X): ), ) - if self.unseen == "raise": - self._check_nan_values_after_transformation(X) + self._check_nan_values_after_transformation(X) return X diff --git a/tests/test_encoding/test_pandas_categorical_encoder.py b/tests/test_encoding/test_pandas_categorical_encoder.py index 9ee2d0160..deb591a71 100644 --- a/tests/test_encoding/test_pandas_categorical_encoder.py +++ b/tests/test_encoding/test_pandas_categorical_encoder.py @@ -1,4 +1,6 @@ import pandas as pd +import pytest +from sklearn.exceptions import NotFittedError from feature_engine.encoding import PandasCategoricalEncoder @@ -53,15 +55,26 @@ def test_transform_alphabetically_unordered_category(): } -def test_transform_with_unseen_data(): +def test_transform_with_unseen_data_and_unseen_is_ignore(): """ Test transforming the dataframe with unseen data. """ df_train = pd.DataFrame({"A": ["a", "c", "b", "a"], "B": ["x", "y", "x", "z"]}) - df_test = pd.DataFrame({"A": ["a", "b", "c", "unseen"], "B": ["x", "y", "z", "w"]}) + df_test = pd.DataFrame( + {"A": ["a", "b", "c", "unseen"], "B": ["x", "y", "z", "unseen"]} + ) transformer = PandasCategoricalEncoder(variables=["A", "B"]) transformed_train_df = transformer.fit_transform(df_train) - transformed_test_df = transformer.transform(df_test) + + with pytest.warns(UserWarning) as record: + transformed_test_df = transformer.transform(df_test) + + msg = "During the encoding, NaN values were introduced in the feature(s) A, B." + + # check that only one warning was raised + assert len(record) == 1 + # check that the message matches + assert record[0].message.args[0] == msg assert transformed_test_df["A"].dtype.name == "category" assert transformed_test_df["B"].dtype.name == "category" @@ -76,6 +89,101 @@ def test_transform_with_unseen_data(): assert transformed_test_df["A"].cat.codes.tolist() == [0, 1, 2, -1] +def test_transform_with_unseen_data_and_unseen_is_raise(): + """ + Test transforming the dataframe with unseen data. + """ + df_train = pd.DataFrame({"A": ["a", "c", "b", "a"], "B": ["x", "y", "x", "z"]}) + df_test = pd.DataFrame( + {"A": ["a", "b", "c", "unseen"], "B": ["x", "y", "z", "unseen"]} + ) + transformer = PandasCategoricalEncoder(variables=["A", "B"], unseen="raise") + msg = "During the encoding, NaN values were introduced in the feature(s) A, B." + + transformer.fit_transform(df_train) + with pytest.raises(ValueError) as record: + transformer.transform(df_test) + + assert str(record.value) == msg + + +def test_fit_raises_error_if_df_contains_na(): + """ + Test that the transform method raises an error if the dataframe contains missing + values. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + df.loc[2, "A"] = None + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + + with pytest.raises(ValueError) as record: + transformer.fit(df) + + msg = ( + "Some of the variables in the dataset contain NaN. Check and " + "remove those before using this transformer or set the parameter " + "`missing_values='ignore'` when initialising this transformer." + ) + assert str(record.value) == msg + + +def test_transform_raises_error_if_df_contains_na(): + """ + Test that the transform method raises an error if the dataframe contains missing + values. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformer.fit(df) + + df.loc[2, "A"] = None + + with pytest.raises(ValueError) as record: + transformer.transform(df) + + msg = ( + "Some of the variables in the dataset contain NaN. Check and " + "remove those before using this transformer or set the parameter " + "`missing_values='ignore'` when initialising this transformer." + ) + assert str(record.value) == msg + + +def test_arbitrary_encoding_automatically_find_variables_ignore_format(): + """ + Test the ignore_format parameter. + """ + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": [1, 2, 1, 3]}) + transformer = PandasCategoricalEncoder(ignore_format=True) + transformer.fit(df) + transformed_df = transformer.transform(df) + + assert transformer.variables_ == ["A", "B"] + assert transformer.encoder_dict_ == { + "A": {"a": 0, "b": 1, "c": 2}, + "B": {1: 0, 2: 1, 3: 2}, + } + assert transformed_df["A"].dtype.name == "category" + assert transformed_df["B"].dtype.name == "category" + assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] + assert list(transformed_df["B"].cat.categories) == [1, 2, 3] + assert transformed_df["A"].cat.codes.tolist() == [0, 1, 0, 2] + assert transformed_df["B"].cat.codes.tolist() == [0, 1, 0, 2] + + +def test_ordered_encoding_1_variable_ignore_format(): + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": [1, 2, 1, 3]}) + transformer = PandasCategoricalEncoder(ignore_format=True, variables=["A"]) + transformer.fit(df) + transformed_df = transformer.transform(df) + + assert transformer.variables_ == ["A"] + assert transformer.encoder_dict_ == {"A": {"a": 0, "b": 1, "c": 2}} + assert transformed_df["A"].dtype.name == "category" + assert list(transformed_df["A"].cat.categories) == ["a", "b", "c"] + assert transformed_df["A"].cat.codes.tolist() == [0, 1, 0, 2] + + def test_transform_with_missing_values(): """ Test transforming the dataframe with missing values. @@ -120,10 +228,50 @@ def test_fit_transform(): "B": {"x": 0, "y": 1, "z": 2}, } + +@pytest.mark.parametrize( + "unseen", ["pizza", "encode", False, 1, ("raise", "ignore"), ["ignore"]] +) +def test_error_if_unseen_not_permitted_value(unseen): + with pytest.raises(ValueError): + PandasCategoricalEncoder(unseen=unseen) + + def test_inverse_transform(): df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) transformer = PandasCategoricalEncoder(variables=["A", "B"]) transformed_df = transformer.fit_transform(df) inverse_df = transformer.inverse_transform(transformed_df) - pd.testing.assert_frame_equal(df, inverse_df) \ No newline at end of file + pd.testing.assert_frame_equal(df, inverse_df) + + +def test_inverse_transform_when_no_unseen(): + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + transformer.fit(df) + transformed_df = transformer.transform(df) + inverse_df = transformer.inverse_transform(transformed_df) + + pd.testing.assert_frame_equal(df, inverse_df) + + +def test_inverse_transform_when_ignore_unseen(): + df1 = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + df2 = pd.DataFrame({"A": ["a", "b", "d", "c"], "B": ["x", "y", "x", "w"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"], unseen="ignore") + transformer.fit(df1) + transformed_df = transformer.transform(df2) + + inverse_df = transformer.inverse_transform(transformed_df) + expected_df = pd.DataFrame({"A": ["a", "b", None, "c"], "B": ["x", "y", "x", None]}) + pd.testing.assert_frame_equal(inverse_df, expected_df) + + +def test_inverse_transform_raises_non_fitted_error(): + df = pd.DataFrame({"A": ["a", "b", "a", "c"], "B": ["x", "y", "x", "z"]}) + transformer = PandasCategoricalEncoder(variables=["A", "B"]) + + # Test when fit is not called prior to transform. + with pytest.raises(NotFittedError): + transformer.inverse_transform(df) From 94a3593948fcd4626f6252c5ac7d71c5484de5dd Mon Sep 17 00:00:00 2001 From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:42:45 +0100 Subject: [PATCH 5/5] Fix linting issues --- feature_engine/encoding/pandas_categorical.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/feature_engine/encoding/pandas_categorical.py b/feature_engine/encoding/pandas_categorical.py index d9362bb18..1dec78fb3 100644 --- a/feature_engine/encoding/pandas_categorical.py +++ b/feature_engine/encoding/pandas_categorical.py @@ -156,12 +156,14 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self._check_na(X, variables_) self.encoder_dict_ = {} + self.ordered_categories_ = {} for feature in variables_: + self.ordered_categories_[feature] = sorted( + [val for val in X[feature].unique() if pd.notnull(val)] + ) self.encoder_dict_[feature] = { category: index - for index, category in enumerate( - sorted([val for val in X[feature].unique() if pd.notnull(val)]) - ) + for index, category in enumerate(self.ordered_categories_[feature]) } if self.unseen == "encode": @@ -180,8 +182,8 @@ def transform(self, X): X (pd.DataFrame): The input DataFrame. Returns: - pd.DataFrame: The transformed DataFrame with specified columns converted to categorical - dtype. + pd.DataFrame: The transformed DataFrame with specified columns converted to + categorical dtype. """ X = self._check_transform_input_and_state(X) # check if dataset contains na @@ -192,9 +194,7 @@ def transform(self, X): X[feature] = pd.Categorical( X[feature], # categories are sorted to ensure consistency between train and test set - categories=sorted( - self.encoder_dict_[feature], key=self.encoder_dict_[feature].get - ), + categories=self.ordered_categories_[feature], ) self._check_nan_values_after_transformation(X)