diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index abce9fc..71b026b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,3 +17,12 @@ repos: rev: 24.4.2 hooks: - id: black + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.8.6 + hooks: + # Run the linter. + - id: ruff + args: [ --fix ] + # Run the formatter. + - id: ruff-format diff --git a/pyproject.toml b/pyproject.toml index 8339bc0..c745516 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,3 +59,13 @@ exclude = ''' [tool.isort] profile = "black" + +[tool.ruff.lint] +select = ["E4", "E7", "E9", "F", "B", "D"] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402"] +"**/{tests,docs,tools}/*" = ["D"] diff --git a/sklearo/__init__.py b/sklearo/__init__.py index e69de29..991b117 100644 --- a/sklearo/__init__.py +++ b/sklearo/__init__.py @@ -0,0 +1 @@ +"""This package provides the main functionalities of the sklearo library.""" diff --git a/sklearo/base.py b/sklearo/base.py index 3261d00..2813d98 100644 --- a/sklearo/base.py +++ b/sklearo/base.py @@ -1,18 +1,24 @@ +"""This module provides base classes for transformers in the encoding process.""" + from abc import ABC, abstractmethod from narwhals.typing import IntoFrameT, IntoSeriesT -class BaseTransformer(ABC): # pragma: no cover +class BaseTransformer(ABC): + """Abstract base class for all transformers.""" @abstractmethod def fit(self, X: IntoFrameT, y: IntoSeriesT | None = None) -> None: + """Fits the transformer to the data.""" pass @abstractmethod def transform(self, X: IntoFrameT) -> IntoFrameT: + """Transforms the data.""" pass def fit_transform(self, X: IntoFrameT, y: IntoSeriesT | None = None) -> IntoFrameT: + """Fits and transforms the data.""" self.fit(X, y) return self.transform(X) diff --git a/sklearo/cv.py b/sklearo/cv.py index 708c9d3..cf99a4b 100644 --- a/sklearo/cv.py +++ b/sklearo/cv.py @@ -1,3 +1,5 @@ +"""This 
module provides cross-validation utilities for model evaluation.""" + import math import narwhals as nw @@ -5,8 +7,7 @@ def ceil_div(col: nw.Expr, divisor: int) -> nw.Expr: - """ - Perform ceiling division on a column. + """Perform ceiling division on a column. This function divides each element in the given column by the specified divisor and returns the smallest integer greater than or equal to the result of the division. @@ -22,8 +23,7 @@ def ceil_div(col: nw.Expr, divisor: int) -> nw.Expr: def add_cv_fold_id_column_k_fold(X: IntoFrameT, k: int = 5) -> IntoFrameT: - """ - Add a column `fold_id` to the DataFrame indicating the fold ID for k-fold cross-validation. + """Add a column `fold_id` to the DataFrame indicating the fold ID for k-fold cross-validation. This function divides the input DataFrame into k folds, ensuring that each fold has approximately the same number of samples. The fold IDs are assigned in a way @@ -82,8 +82,7 @@ def add_cv_fold_id_column_k_fold(X: IntoFrameT, k: int = 5) -> IntoFrameT: def add_cv_fold_id_column_stratified_k_fold( X: IntoFrameT, y: IntoSeriesT, k: int = 5 ) -> IntoFrameT: - """ - Add a `fold_id` column to the DataFrame indicating the fold ID for stratified k-fold CV. + """Add a `fold_id` column to the DataFrame indicating the fold ID for stratified k-fold CV. This function ensures that each fold has approximately the same proportion of each class as the original dataset. 
It calculates the fold IDs based on the distribution of the target diff --git a/sklearo/encoding/__init__.py b/sklearo/encoding/__init__.py index 85c9f6d..f3c9cbc 100644 --- a/sklearo/encoding/__init__.py +++ b/sklearo/encoding/__init__.py @@ -1,3 +1,5 @@ +"""This module provides encoding techniques for categorical features.""" + from .target import TargetEncoder from .woe import WOEEncoder diff --git a/sklearo/encoding/base.py b/sklearo/encoding/base.py index 892eadd..b7a3671 100644 --- a/sklearo/encoding/base.py +++ b/sklearo/encoding/base.py @@ -1,3 +1,5 @@ +"""Base classes for encoders.""" + import warnings from abc import abstractmethod from collections import defaultdict @@ -15,14 +17,16 @@ class BaseOneToOneEncoder(BaseTransformer): + """Base class for one-to-one encoders.""" def _handle_missing_values(self, X: IntoFrameT) -> IntoFrameT: + """Handles missing values in the input data.""" if self.missing_values == "ignore": return X if self.missing_values == "raise": if max(X[self.columns_].null_count().row(0)) > 0: raise ValueError( - f"Some columns have missing values. " + "Some columns have missing values. " "Please handle missing values before encoding or set " "missing_values to either 'ignore' or 'encode'." ) @@ -40,6 +44,7 @@ def _handle_missing_values(self, X: IntoFrameT) -> IntoFrameT: class BaseTargetEncoder(BaseOneToOneEncoder): + """Abstract base class for target encoders.""" @abstractmethod def _calculate_target_statistic( @@ -49,6 +54,7 @@ def _calculate_target_statistic( raise NotImplementedError # pragma: no cover def check_target_type(self, y: IntoSeriesT) -> str: + """Check the type of the target variable.""" if hasattr(self, "target_type_"): return if not hasattr(self, "target_type") or self.target_type == "auto": @@ -71,7 +77,6 @@ def fit(self, X: IntoFrameT, y: IntoSeriesT) -> "BaseTargetEncoder": X (DataFrame): The input data. y (Series): The target variable. 
""" - self.check_target_type(y) self.columns_ = list(select_columns(X, self.columns)) self.encoding_map_ = {} @@ -163,7 +168,8 @@ def transform(self, X: IntoFrameT) -> IntoFrameT: warnings.warn( f"Unseen categories {unseen_per_col} found during transform. " "Please handle unseen categories for example by using a RareLabelEncoder. " - f"These categories will be encoded as {self.fill_value_unseen}." + f"These categories will be encoded as {self.fill_value_unseen}.", + stacklevel=2, ) if self.target_type_ in ("binary", "continuous"): @@ -174,6 +180,7 @@ def transform(self, X: IntoFrameT) -> IntoFrameT: @check_if_fitted def get_feature_names_out(self) -> list[str]: + """Get the output feature names.""" if self.target_type_ in ("binary", "continuous"): return self.feature_names_in_ diff --git a/sklearo/encoding/target.py b/sklearo/encoding/target.py index af0ad9c..b155308 100644 --- a/sklearo/encoding/target.py +++ b/sklearo/encoding/target.py @@ -1,4 +1,6 @@ -from typing import Any, Literal, Sequence, Tuple +"""TargetEncoder class for encoding categorical features using the Target Encoding technique.""" + +from typing import Literal, Sequence import narwhals as nw from narwhals.typing import IntoFrameT @@ -9,8 +11,7 @@ class TargetEncoder(BaseTargetEncoder): - """ - Target Encoder for categorical features. + """Target Encoder for categorical features. This class provides functionality to encode categorical features using the Target Encoding technique. Target Encoding replaces each category with the mean of the target variable for that @@ -51,6 +52,10 @@ class TargetEncoder(BaseTargetEncoder): - If `'multiclass'`, the target variable is multiclass. - If `'continuous'`, the target variable is continuous. + smooth (float, Literal["auto"]): Smoothing parameter to avoid overfitting. If `'auto'`, the + smoothing parameter is calculated based on the variance of the target variable. + + Attributes: columns_ (list[str]): List of columns to be encoded, learned during fit. 
encoding_map_ (dict[str, float]): Mapping of categories to their mean target values, learned @@ -98,7 +103,7 @@ def __init__( smooth: Literal["auto"] | float = "auto", cv: Annotated[int, Field(ge=2)] = 5, ) -> None: - + """Class constructor for TargetEncoder.""" self.columns = columns self.missing_values = missing_values self.unseen = unseen @@ -112,7 +117,6 @@ def __init__( def _calculate_target_statistic( self, x_y: IntoFrameT, target_col: str, column: str ) -> dict: - if column in ( "count_per_category", "sum_target_per_category", diff --git a/sklearo/encoding/woe.py b/sklearo/encoding/woe.py index 6aa948a..417c12c 100644 --- a/sklearo/encoding/woe.py +++ b/sklearo/encoding/woe.py @@ -1,3 +1,5 @@ +"""This module provides the Weight of Evidence (WoE) encoding techniques for categorical features.""" + import math from typing import Literal, Sequence @@ -143,6 +145,7 @@ def __init__( missing_values: Literal["encode", "ignore", "raise"] = "encode", cv: Annotated[int, Field(ge=2)] = 5, ) -> None: + """Initializes the WOEEncoder with the specified parameters.""" self.columns = columns self.underrepresented_categories = underrepresented_categories self.missing_values = missing_values diff --git a/sklearo/utils.py b/sklearo/utils.py index 3b23648..37de68d 100644 --- a/sklearo/utils.py +++ b/sklearo/utils.py @@ -1,3 +1,5 @@ +"""Utility functions for the sklearo package.""" + import inspect import re from typing import Sequence @@ -20,18 +22,21 @@ def select_columns_by_regex_pattern(df: nw.DataFrame, pattern: str): + """Selects columns from the DataFrame that match the given regex pattern.""" for column in df.columns: if re.search(pattern, column): yield column def select_columns_by_types(df: nw.DataFrame, dtypes: list[nw.dtypes.DType]): + """Selects columns from the DataFrame that match the specified data types.""" for column, dtype in zip(df.schema.names(), df.schema.dtypes()): if dtype in dtypes: yield column def select_columns(df: nw.DataFrame, columns: 
Sequence[nw.typing.DTypes | str] | str): + """Selects specified columns from the DataFrame.""" if isinstance(columns, str): yield from select_columns_by_regex_pattern(df, columns) diff --git a/sklearo/validation.py b/sklearo/validation.py index 8ac07c5..58daa6b 100644 --- a/sklearo/validation.py +++ b/sklearo/validation.py @@ -1,9 +1,11 @@ -from functools import wraps +"""Validation utilities for sklearo.""" -from sklearo.utils import infer_target_type +from functools import wraps def check_X_y(func): + """Decorator to check the input data X and y.""" + @wraps(func) def wrapper(self, X, y, *args, **kwargs): if not X.shape[0] == y.shape[0]: @@ -22,6 +24,8 @@ def wrapper(self, X, y, *args, **kwargs): def check_if_fitted(func): + """Decorator to check if the model is fitted before calling the method.""" + @wraps(func) def wrapper(self, *args, **kwargs): if not any( diff --git a/tests/encoding/test_target.py b/tests/encoding/test_target.py index 820e704..eb1453f 100644 --- a/tests/encoding/test_target.py +++ b/tests/encoding/test_target.py @@ -43,7 +43,6 @@ def test_target_encoder_fit_transform_comparison_with_scikit_learn(): "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"] ) class TestTargetEncoder: - @pytest.fixture def binary_class_data(self): data = { @@ -708,7 +707,6 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame): encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) def test_target_encoder_fit_transform(self, binary_class_data, DataFrame): - binary_class_data = DataFrame( { "category": binary_class_data["category"] * 2, @@ -746,7 +744,6 @@ def test_target_encoder_fit_transform(self, binary_class_data, DataFrame): def test_target_encoder_fit_transform_set_smoothing( self, binary_class_data, DataFrame ): - binary_class_data = DataFrame( { "category": binary_class_data["category"] * 2, diff --git a/tests/encoding/test_woe.py b/tests/encoding/test_woe.py index 66596dd..882dcd2 100644 --- 
a/tests/encoding/test_woe.py +++ b/tests/encoding/test_woe.py @@ -10,7 +10,6 @@ "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"] ) class TestWOEEncoder: - @pytest.fixture def binary_class_data(self): # fmt: off @@ -340,7 +339,7 @@ def test_woe_encoder_handle_missing_values_multi_class( missing_values="encode", underrepresented_categories="fill" ) encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) - transformed = encoder.transform(multi_class_data[["category"]]) + encoder.transform(multi_class_data[["category"]]) assert "MISSING" in encoder.encoding_map_["category"][1] @@ -520,7 +519,6 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame): encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) def test_woe_encoder_fit_transform(self, binary_class_data, DataFrame): - binary_class_data = DataFrame( { "category": binary_class_data["category"] * 2, diff --git a/tests/test_cv.py b/tests/test_cv.py index 9cb21a2..f5f0b35 100644 --- a/tests/test_cv.py +++ b/tests/test_cv.py @@ -13,7 +13,6 @@ "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"] ) class TestCVFunctions: - def test_add_cv_fold_id_column_k_fold(self, DataFrame): data = { "A": range(10), diff --git a/tests/test_utils.py b/tests/test_utils.py index f906093..6bbba3e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -19,7 +19,6 @@ "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"] ) class TestSelectColumns: - @pytest.fixture def sample_data(self): data = { @@ -81,7 +80,6 @@ def test_select_columns_invalid_type(self, sample_data, DataFrame): @pytest.mark.parametrize("Series", [pd.Series, pl.Series], ids=["pandas", "polars"]) class TestTypeOfTarget: - @pytest.mark.parametrize( "data, expected", [