From 22dd5b31b46990d847380beee1952059bbc982d4 Mon Sep 17 00:00:00 2001
From: Claudio Salvatore Arcidiacono <22871978+ClaudioSalvatoreArcidiacono@users.noreply.github.com>
Date: Fri, 6 Dec 2024 16:58:49 +0100
Subject: [PATCH] First draft for WOE

---
 Makefile                     |  25 ++
 README.md                    |   2 +-
 development_guide.md         |  12 ++
 pyproject.toml               |  51 +++++++
 requirements.txt             |  20 +++
 sklearo/__init__.py          |   0
 sklearo/encoding/__init__.py |   3 +
 sklearo/encoding/woe.py      | 250 +++++++++++++++++++++++++++++++++++
 sklearo/utils.py             |  26 ++++
 9 files changed, 388 insertions(+), 1 deletion(-)
 create mode 100644 Makefile
 create mode 100644 development_guide.md
 create mode 100644 pyproject.toml
 create mode 100644 requirements.txt
 create mode 100644 sklearo/__init__.py
 create mode 100644 sklearo/encoding/__init__.py
 create mode 100644 sklearo/encoding/woe.py
 create mode 100644 sklearo/utils.py

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a93bf53
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+env-create:
+	pip install --upgrade pip
+	if ! test -d venv; \
+	then \
+		echo creating virtual environment; \
+		pip install --upgrade virtualenv; \
+		python -m venv venv; \
+	fi
+
+env-install:
+	pip install --upgrade pip
+	if test -s requirements.txt; \
+	then \
+		echo Installing requirements from requirements.txt; \
+		pip install -r requirements.txt ; \
+		pip install -e . --no-deps ; \
+	else \
+		echo Installing requirements from pyproject.toml; \
+		pip install -e '.[dev]'; \
+		pip freeze --exclude-editable > requirements.txt; \
+	fi
+
+env-update:
+	pip install -e '.[dev]'
+	pip freeze --exclude-editable > requirements.txt
diff --git a/README.md b/README.md
index a99afec..ba8ee0f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
 # sklearo
-A versatile Python package featuring scikit-learn like transformers for feature preprocessing, compatible with all kind of dataframes thanks to narwals.
+A versatile Python package featuring scikit-learn-like transformers for feature preprocessing, compatible with all kinds of dataframes thanks to narwhals.
diff --git a/development_guide.md b/development_guide.md
new file mode 100644
index 0000000..2cfb5d9
--- /dev/null
+++ b/development_guide.md
@@ -0,0 +1,12 @@
+# Development Guide
+
+## Installing dev dependencies
+
+```bash
+# Create a new virtual environment
+python -m venv venv
+# Activate the virtual environment
+source venv/bin/activate
+# Install the dependencies
+make env-install
+```
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3d03a39
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,51 @@
+[project]
+name = "sklearo"
+description = "A versatile Python package featuring scikit-learn-like transformers for feature preprocessing, compatible with all kinds of dataframes thanks to narwhals."
+version = "0.1.0"
+keywords = ["feature preprocessing", "scikit-learn", "machine learning"]
+authors = [
+    { name = "Claudio Salvatore Arcidiacono", email = "author@email.com" },
+]
+readme = "README.md"
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+dependencies = ["narwhals", "pydantic"]
+
+[project.optional-dependencies]
+dev = ["black", "ruff", "pre-commit", "pytest", "polars"]
+doc = ["mkdocs", "mkdocs-material", "mkdocstrings[python]", "mkdocs-jupyter"]
+build = ["build", "twine"]
+
+[build-system]
+build-backend = "flit_core.buildapi"
+requires = ["flit_core >=3.2,<4"]
+
+[project.urls]
+"Homepage" = "https://github.com/ClaudioSalvatoreArcidiacono/sklearo"
+"Documentation" = "https://claudiosalvatorearcidiacono.github.io/sklearo/"
+"Bug Tracker" = "https://github.com/ClaudioSalvatoreArcidiacono/sklearo/issues"
+
+[tool.black]
+line-length = 88
+include = '\.pyi?$'
+exclude = '''
+/(
+    \.git
+  | \.hg
+  | \.mypy_cache
+  | \.tox
+  | \.venv
+  | _build
+  | buck-out
+  | build
+  | dist
+)/
+'''
+
+[tool.ruff]
+line-length = 88
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b842f94
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+black==24.10.0
+cfgv==3.4.0
+click==8.1.7
+distlib==0.3.9
+filelock==3.16.1
+identify==2.6.3
+iniconfig==2.0.0
+mypy-extensions==1.0.0
+narwhals==1.15.2
+nodeenv==1.9.1
+packaging==24.2
+pathspec==0.12.1
+platformdirs==4.3.6
+pluggy==1.5.0
+polars==1.16.0
+pre_commit==4.0.1
+pytest==8.3.4
+PyYAML==6.0.2
+ruff==0.8.2
+virtualenv==20.28.0
diff --git a/sklearo/__init__.py b/sklearo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sklearo/encoding/__init__.py b/sklearo/encoding/__init__.py
new file mode 100644
index 0000000..266528e
--- /dev/null
+++ b/sklearo/encoding/__init__.py
@@ -0,0 +1,3 @@
+from .woe import WOEEncoder
+
+__all__ = ["WOEEncoder"]
\ No newline at end of file
diff --git a/sklearo/encoding/woe.py b/sklearo/encoding/woe.py
new file mode 100644
index 0000000..db618cc
--- /dev/null
+++ b/sklearo/encoding/woe.py
@@ -0,0 +1,250 @@
+import math
+import warnings
+from typing import Literal, Sequence
+
+import narwhals as nw
+from narwhals.typing import IntoFrameT, IntoSeriesT
+from pydantic import validate_arguments
+
+from sklearo.utils import select_columns
+
+
+class WOEEncoder:
+    """Weight of Evidence (WOE) Encoder.
+
+    This class encodes categorical features using the Weight of Evidence (WOE) technique. WOE is
+    commonly used in credit scoring and other binary classification problems to transform
+    categorical variables into continuous variables.
+
+    WOE is defined as the natural logarithm of the ratio of the distribution of goods (i.e. the
+    negative class, 0) to the distribution of bads (i.e. the positive class, 1) for a
+    given category.
+
+    ```
+    WOE = ln((% of goods) / (% of bads))
+    ```
+
+    The WOE value is positive if the category is more likely to be good (negative class) and
+    negative if it is more likely to be bad (positive class), so the WOE is inversely related to
+    the target variable. For example, a category containing 4% of all goods but only 1% of all
+    bads gets WOE = ln(4) ≈ 1.39.
+
+    The WOE encoding is useful for logistic regression and other linear models, as it transforms
+    the categorical variables into continuous variables that can be used as input features.
+
+    Args:
+        columns (str, list[str], list[nw.typing.DTypes]): list of columns to encode.
+            If a single string is passed instead, it is treated as a regular expression pattern to
+            match column names. If a list of `narwhals.typing.DTypes` is passed, all columns
+            matching the specified dtypes are selected. Defaults to (narwhals.Categorical,
+            narwhals.String).
+        underrepresented_categories (str): Strategy to handle underrepresented categories.
+            If 'raise', an error is raised when a category is missing one of the target classes.
+            If 'fill', such categories are encoded using the fill_values_underrepresented values.
+        fill_values_underrepresented (Sequence[int | float]): Fill values to use for
+            underrepresented categories. The first value is used when there are no goods and the
+            second value when there are no bads. Only used when underrepresented_categories is
+            set to 'fill'. Defaults to (-999.0, 999.0).
+        unseen (str): Strategy to handle unseen categories. If 'raise', an error is raised when
+            unseen categories are found. If 'ignore', the unseen categories are encoded with the
+            fill_value_unseen.
+        fill_value_unseen (int | float): Fill value to use for unseen categories. Only used when
+            unseen is set to 'ignore'. Defaults to 0.0.
+        missing_values (str): Strategy to handle missing values. If 'encode', missing values are
+            first replaced with 'MISSING' and the WOE is computed as if it were a regular
+            category. If 'ignore', missing values are left as is. If 'raise', an error is raised
+            when missing values are found.
+        suffix (str): Suffix to append to the column names of the encoded columns. If an empty
+            string is passed, the original column names are replaced. Defaults to "".
+
+    Attributes:
+        columns_ (list): List of columns to be encoded, learned during fit.
+        encoding_map_ (dict): Dictionary mapping columns to their WOE values, learned during fit.
+
+    Examples:
+        ```python
+        import pandas as pd
+        from sklearo.encoding import WOEEncoder
+
+        data = {
+            "category": ["A", "B", "A", "C", "B", "C", "A", "B", "C"],
+            "target": [0, 0, 1, 0, 1, 0, 1, 0, 1],
+        }
+        df = pd.DataFrame(data)
+
+        encoder = WOEEncoder()
+        encoder.fit(df[["category"]], df["target"])
+        encoded = encoder.transform(df[["category"]])
+        print(encoded)
+           category
+        0 -0.916291
+        1  0.470004
+        2 -0.916291
+        3  0.470004
+        4  0.470004
+        5  0.470004
+        6 -0.916291
+        7  0.470004
+        8  0.470004
+        ```
+    """
+
+    @validate_arguments(config=dict(arbitrary_types_allowed=True))
+    def __init__(
+        self,
+        columns: Sequence[nw.typing.DTypes | str] | str = (
+            nw.Categorical,
+            nw.String,
+        ),
+        underrepresented_categories: Literal["raise", "fill"] = "raise",
+        fill_values_underrepresented: Sequence[int | float | None] | None = (
+            -999.0,
+            999.0,
+        ),
+        unseen: Literal["raise", "ignore"] = "raise",
+        fill_value_unseen: int | float | None = 0.0,
+        missing_values: Literal["encode", "ignore", "raise"] = "encode",
+        suffix: str = "",
+    ) -> None:
+        self.columns = columns
+        self.underrepresented_categories = underrepresented_categories
+        self.missing_values = missing_values
+        self.fill_values_underrepresented = fill_values_underrepresented or (None, None)
+        self.unseen = unseen
+        self.fill_value_unseen = fill_value_unseen
+        self.suffix = suffix
+
+    def _handle_missing_values(self, x: IntoSeriesT) -> IntoSeriesT:
+        if self.missing_values == "ignore":
+            return x
+        if self.missing_values == "raise":
+            if x.null_count() > 0:
+                raise ValueError(
+                    f"Column {x.name} has missing values. "
+                    "Please handle missing values before encoding or set "
+                    "missing_values to either 'ignore' or 'encode'."
+                )
+            return x
+        if self.missing_values == "encode":
+            return x.fill_null("MISSING")
+
+    def _calculate_woe(
+        self, x: IntoSeriesT, y: IntoSeriesT, total_goods: int, total_bads: int
+    ) -> dict[str, float | int | None]:
+        """Calculate the Weight of Evidence mapping for a single column."""
+
+        categories_n_goods_n_bads_dist_ratio = (
+            x.to_frame()
+            .with_columns(y)
+            .group_by(x.name)
+            .agg(
+                n_total=nw.col(y.name).count(),
+                n_bads=nw.col(y.name).sum(),
+            )
+            .with_columns(n_goods=nw.col("n_total") - nw.col("n_bads"))
+            .with_columns(
+                perc_goods=nw.col("n_goods") / total_goods,
+                perc_bads=nw.col("n_bads") / total_bads,
+            )
+            # Ratio of the distribution of goods to the distribution of bads,
+            # matching the WOE definition in the class docstring.
+            .with_columns(dist_ratio=nw.col("perc_goods") / nw.col("perc_bads"))
+            .select(x.name, "n_goods", "n_bads", "dist_ratio")
+            .rows()
+        )
+        categories, n_goods, n_bads, dist_ratios = zip(
+            *categories_n_goods_n_bads_dist_ratio
+        )
+
+        if any(n_good == 0 for n_good in n_goods) or any(
+            n_bad == 0 for n_bad in n_bads
+        ):
+            problematic_categories = [
+                cat
+                for cat, n_good, n_bad in zip(categories, n_goods, n_bads)
+                if n_good == 0 or n_bad == 0
+            ]
+            msg = (
+                f"The categories {problematic_categories} for the column {x.name} "
+                "are missing one of the target classes. For WOE to be defined, all categories "
+                "should have at least one observation of each target class. Please consider "
+                "removing infrequent categories using a RareLabelEncoder"
+            )
+            if self.underrepresented_categories == "raise":
+                raise ValueError(
+                    msg + " or setting underrepresented_categories to 'fill'."
+                )
+            else:  # fill
+                warnings.warn(
+                    msg + ". The infrequent categories will be encoded as "
+                    f"{self.fill_values_underrepresented[0]} "
+                    f"when there are no goods and as {self.fill_values_underrepresented[1]} when "
+                    "there are no bads."
+                )
+
+        woes = []
+        for dist_ratio, n_good, n_bad in zip(dist_ratios, n_goods, n_bads):
+            if n_good == 0:
+                # The category contains only bads, so the WOE would be -inf.
+                woes.append(self.fill_values_underrepresented[0])
+            elif n_bad == 0:
+                # The category contains only goods, so the WOE would be +inf.
+                woes.append(self.fill_values_underrepresented[1])
+            else:
+                woes.append(math.log(dist_ratio))
+
+        return dict(zip(categories, woes))
+
+    @nw.narwhalify
+    def fit(self, X: IntoFrameT, y: IntoSeriesT) -> "WOEEncoder":
+        """Fit the encoder."""
+
+        self.columns_ = list(select_columns(X, self.columns))
+        self.encoding_map_ = {}
+
+        total_bads = y.sum()
+        total_goods = y.count() - total_bads
+        for column in self.columns_:
+            self.encoding_map_[column] = self._calculate_woe(
+                self._handle_missing_values(X[column]), y, total_goods, total_bads
+            )
+        return self
+
+    @nw.narwhalify
+    def transform(self, X: IntoFrameT) -> IntoFrameT:
+        """Transform the data."""
+
+        unseen_per_col = {}
+        for column, mapping in self.encoding_map_.items():
+            uniques = self._handle_missing_values(X[column]).unique()
+            unseen_cats = uniques.filter(
+                ~uniques.is_in(list(mapping.keys()))
+            ).to_list()
+            if unseen_cats:
+                unseen_per_col[column] = unseen_cats
+
+        if unseen_per_col:
+            if self.unseen == "raise":
+                raise ValueError(
+                    f"Unseen categories {unseen_per_col} found during transform. "
+                    "Please handle unseen categories, for example by using a RareLabelEncoder. "
+                    "Alternatively, set unseen to 'ignore'."
+                )
+            else:
+                warnings.warn(
+                    f"Unseen categories {unseen_per_col} found during transform. "
+                    "Please handle unseen categories, for example by using a RareLabelEncoder. "
+                    f"These categories will be encoded as {self.fill_value_unseen}."
+                )
+
+        return X.with_columns(
+            (
+                nw.col(column).fill_null("MISSING")
+                if self.missing_values == "encode"
+                else nw.col(column)
+            )
+            # Seen categories map to their WOE, unseen ones to fill_value_unseen.
+            .replace_strict(
+                {
+                    **mapping,
+                    **{
+                        cat: self.fill_value_unseen
+                        for cat in unseen_per_col.get(column, [])
+                    },
+                }
+            )
+            .alias(f"{column}{self.suffix}")
+            for column, mapping in self.encoding_map_.items()
+        )
diff --git a/sklearo/utils.py b/sklearo/utils.py
new file mode 100644
index 0000000..b789189
--- /dev/null
+++ b/sklearo/utils.py
@@ -0,0 +1,26 @@
+import re
+
+import narwhals as nw
+from narwhals.typing import IntoFrameT
+
+
+def select_columns_by_regex_pattern(df: IntoFrameT, pattern: str):
+    """Yield the column names of df that match the given regex pattern."""
+    for column in df.columns:
+        if re.search(pattern, column):
+            yield column
+
+
+def select_columns_by_types(df: IntoFrameT, dtypes: list[type[nw.dtypes.DType]]):
+    """Yield the column names of df whose dtype is one of the given dtypes."""
+    for column, dtype in zip(df.schema.names(), df.schema.dtypes()):
+        if dtype in dtypes:
+            yield column
+
+
+def select_columns(df: IntoFrameT, columns):
+    """Yield column names selected by a regex pattern, explicit names or dtypes."""
+    if isinstance(columns, str):
+        yield from select_columns_by_regex_pattern(df, columns)
+
+    if isinstance(columns, (list, tuple)) and columns:
+        if isinstance(columns[0], str):
+            yield from columns
+        elif isinstance(columns[0], type) and issubclass(columns[0], nw.dtypes.DType):
+            yield from select_columns_by_types(df, columns)
+        else:
+            raise ValueError("Invalid columns type")
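To try the encoder introduced by this patch, a minimal usage sketch is below, assuming the package has been installed locally (for example with `make env-install` or `pip install -e .`). The `product` and `city` column names, the regex pattern and the `_woe` suffix are illustrative only and not part of the patch; they exercise the documented `columns` (regex) and `suffix` parameters:

```python
import pandas as pd

from sklearo.encoding import WOEEncoder

# Toy data with one column to encode ("product") and one to leave untouched ("city").
df = pd.DataFrame(
    {
        "product": ["A", "B", "A", "C", "B", "C", "A", "B", "C"],
        "city": ["rome", "oslo", "rome", "oslo", "rome", "oslo", "rome", "oslo", "rome"],
        "target": [0, 0, 1, 0, 1, 0, 1, 0, 1],
    }
)

# Select the columns to encode with a regex pattern and keep the original
# columns by writing the WOE values to new "<column>_woe" columns.
encoder = WOEEncoder(columns="^product$", suffix="_woe")
encoder.fit(df[["product", "city"]], df["target"])

print(encoder.encoding_map_)  # per-column {category: WOE} mappings learned during fit
print(encoder.transform(df[["product", "city"]]))
```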