Add ruff as a linter

ClaudioSalvatoreArcidiacono · Jan 7, 2025 · c7cc083
1 parent e27c918
commit c7cc083
Show file tree

Hide file tree

Showing 15 changed files with 68 additions and 26 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,3 +17,12 @@ repos:
       rev: 24.4.2
       hooks:
           - id: black
+    - repo: https://github.com/astral-sh/ruff-pre-commit
+      # Ruff version.
+      rev: v0.8.6
+      hooks:
+        # Run the linter.
+        - id: ruff
+          args: [ --fix ]
+        # Run the formatter.
+        - id: ruff-format
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,3 +59,13 @@ exclude = '''
 
 [tool.isort]
 profile = "black"
+
+[tool.ruff.lint]
+select = ["E4", "E7", "E9", "F", "B", "D"]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402"]
+"**/{tests,docs,tools}/*" = ["D"]
diff --git a/sklearo/__init__.py b/sklearo/__init__.py
@@ -0,0 +1 @@
+"""This package provides the main functionalities of the sklearo library."""
diff --git a/sklearo/base.py b/sklearo/base.py
@@ -1,18 +1,24 @@
+"""This module provides base classes for transformers in the encoding process."""
+
 from abc import ABC, abstractmethod
 
 from narwhals.typing import IntoFrameT, IntoSeriesT
 
 
-class BaseTransformer(ABC):  # pragma: no cover
+class BaseTransformer(ABC):
+    """Abstract base class for all transformers."""
 
     @abstractmethod
     def fit(self, X: IntoFrameT, y: IntoSeriesT | None = None) -> None:
+        """Fits the transformer to the data."""
         pass
 
     @abstractmethod
     def transform(self, X: IntoFrameT) -> IntoFrameT:
+        """Transforms the data."""
         pass
 
     def fit_transform(self, X: IntoFrameT, y: IntoSeriesT | None = None) -> IntoFrameT:
+        """Fits and transforms the data."""
         self.fit(X, y)
         return self.transform(X)
diff --git a/sklearo/cv.py b/sklearo/cv.py
@@ -1,12 +1,13 @@
+"""This module provides cross-validation utilities for model evaluation."""
+
 import math
 
 import narwhals as nw
 from narwhals.typing import IntoFrameT, IntoSeriesT
 
 
 def ceil_div(col: nw.Expr, divisor: int) -> nw.Expr:
-    """
-    Perform ceiling division on a column.
+    """Perform ceiling division on a column.
 
     This function divides each element in the given column by the specified divisor
     and returns the smallest integer greater than or equal to the result of the division.
@@ -22,8 +23,7 @@ def ceil_div(col: nw.Expr, divisor: int) -> nw.Expr:
 
 
 def add_cv_fold_id_column_k_fold(X: IntoFrameT, k: int = 5) -> IntoFrameT:
-    """
-    Add a column `fold_id` to the DataFrame indicating the fold ID for k-fold cross-validation.
+    """Add a column `fold_id` to the DataFrame indicating the fold ID for k-fold cross-validation.
 
     This function divides the input DataFrame into k folds, ensuring that each fold
     has approximately the same number of samples. The fold IDs are assigned in a way
@@ -82,8 +82,7 @@ def add_cv_fold_id_column_k_fold(X: IntoFrameT, k: int = 5) -> IntoFrameT:
 def add_cv_fold_id_column_stratified_k_fold(
     X: IntoFrameT, y: IntoSeriesT, k: int = 5
 ) -> IntoFrameT:
-    """
-    Add a `fold_id` column to the DataFrame indicating the fold ID for stratified k-fold CV.
+    """Add a `fold_id` column to the DataFrame indicating the fold ID for stratified k-fold CV.
 
     This function ensures that each fold has approximately the same proportion of each class
     as the original dataset. It calculates the fold IDs based on the distribution of the target

diff --git a/sklearo/encoding/__init__.py b/sklearo/encoding/__init__.py
@@ -1,3 +1,5 @@
+"""This module provides encoding techniques for categorical features."""
+
 from .target import TargetEncoder
 from .woe import WOEEncoder
 

diff --git a/sklearo/encoding/base.py b/sklearo/encoding/base.py
@@ -1,3 +1,5 @@
+"""Base classes for encoders."""
+
 import warnings
 from abc import abstractmethod
 from collections import defaultdict
@@ -15,14 +17,16 @@
 
 
 class BaseOneToOneEncoder(BaseTransformer):
+    """Base class for one-to-one encoders."""
 
     def _handle_missing_values(self, X: IntoFrameT) -> IntoFrameT:
+        """Handles missing values in the input data."""
         if self.missing_values == "ignore":
             return X
         if self.missing_values == "raise":
             if max(X[self.columns_].null_count().row(0)) > 0:
                 raise ValueError(
-                    f"Some columns have missing values. "
+                    "Some columns have missing values. "
                     "Please handle missing values before encoding or set "
                     "missing_values to either 'ignore' or 'encode'."
                 )
@@ -40,6 +44,7 @@ def _handle_missing_values(self, X: IntoFrameT) -> IntoFrameT:
 
 
 class BaseTargetEncoder(BaseOneToOneEncoder):
+    """Abstract base class for target encoders."""
 
     @abstractmethod
     def _calculate_target_statistic(
@@ -49,6 +54,7 @@ def _calculate_target_statistic(
         raise NotImplementedError  # pragma: no cover
 
     def check_target_type(self, y: IntoSeriesT) -> str:
+        """Check the type of the target variable."""
         if hasattr(self, "target_type_"):
             return
         if not hasattr(self, "target_type") or self.target_type == "auto":
@@ -71,7 +77,6 @@ def fit(self, X: IntoFrameT, y: IntoSeriesT) -> "BaseTargetEncoder":
             X (DataFrame): The input data.
             y (Series): The target variable.
         """
-
         self.check_target_type(y)
         self.columns_ = list(select_columns(X, self.columns))
         self.encoding_map_ = {}
@@ -163,7 +168,8 @@ def transform(self, X: IntoFrameT) -> IntoFrameT:
                 warnings.warn(
                     f"Unseen categories {unseen_per_col} found during transform. "
                     "Please handle unseen categories for example by using a RareLabelEncoder. "
-                    f"These categories will be encoded as {self.fill_value_unseen}."
+                    f"These categories will be encoded as {self.fill_value_unseen}.",
+                    stacklevel=2,
                 )
 
         if self.target_type_ in ("binary", "continuous"):
@@ -174,6 +180,7 @@ def transform(self, X: IntoFrameT) -> IntoFrameT:
 
     @check_if_fitted
     def get_feature_names_out(self) -> list[str]:
+        """Get the output feature names."""
         if self.target_type_ in ("binary", "continuous"):
             return self.feature_names_in_
 

diff --git a/sklearo/encoding/target.py b/sklearo/encoding/target.py
@@ -1,4 +1,6 @@
-from typing import Any, Literal, Sequence, Tuple
+"""TargetEncoder class for encoding categorical features using the Target Encoding technique."""
+
+from typing import Literal, Sequence
 
 import narwhals as nw
 from narwhals.typing import IntoFrameT
@@ -9,8 +11,7 @@
 
 
 class TargetEncoder(BaseTargetEncoder):
-    """
-    Target Encoder for categorical features.
+    """Target Encoder for categorical features.
 
     This class provides functionality to encode categorical features using the Target Encoding
     technique. Target Encoding replaces each category with the mean of the target variable for that
@@ -51,6 +52,10 @@ class TargetEncoder(BaseTargetEncoder):
             - If `'multiclass'`, the target variable is multiclass.
             - If `'continuous'`, the target variable is continuous.
 
+        smooth (float, Literal["auto"]): Smoothing parameter to avoid overfitting. If `'auto'`, the
+            smoothing parameter is calculated based on the variance of the target variable.
+
+
     Attributes:
         columns_ (list[str]): List of columns to be encoded, learned during fit.
         encoding_map_ (dict[str, float]): Mapping of categories to their mean target values, learned
@@ -98,7 +103,7 @@ def __init__(
         smooth: Literal["auto"] | float = "auto",
         cv: Annotated[int, Field(ge=2)] = 5,
     ) -> None:
-
+        """Class constructor for TargetEncoder."""
         self.columns = columns
         self.missing_values = missing_values
         self.unseen = unseen
@@ -112,7 +117,6 @@ def __init__(
     def _calculate_target_statistic(
         self, x_y: IntoFrameT, target_col: str, column: str
     ) -> dict:
-
         if column in (
             "count_per_category",
             "sum_target_per_category",

diff --git a/sklearo/encoding/woe.py b/sklearo/encoding/woe.py
@@ -1,3 +1,5 @@
+"""This module provides the Weight of Evidence (WoE) encoding techniques for categorical features."""
+
 import math
 from typing import Literal, Sequence
 
@@ -143,6 +145,7 @@ def __init__(
         missing_values: Literal["encode", "ignore", "raise"] = "encode",
         cv: Annotated[int, Field(ge=2)] = 5,
     ) -> None:
+        """Initializes the WoEEncoder with the specified parameters."""
         self.columns = columns
         self.underrepresented_categories = underrepresented_categories
         self.missing_values = missing_values

diff --git a/sklearo/utils.py b/sklearo/utils.py
@@ -1,3 +1,5 @@
+"""Utility functions for the sklearo package."""
+
 import inspect
 import re
 from typing import Sequence
@@ -20,18 +22,21 @@
 
 
 def select_columns_by_regex_pattern(df: nw.DataFrame, pattern: str):
+    """Selects columns from the DataFrame that match the given regex pattern."""
     for column in df.columns:
         if re.search(pattern, column):
             yield column
 
 
 def select_columns_by_types(df: nw.DataFrame, dtypes: list[nw.dtypes.DType]):
+    """Selects columns from the DataFrame that match the specified data types."""
     for column, dtype in zip(df.schema.names(), df.schema.dtypes()):
         if dtype in dtypes:
             yield column
 
 
 def select_columns(df: nw.DataFrame, columns: Sequence[nw.typing.DTypes | str] | str):
+    """Selects specified columns from the DataFrame."""
     if isinstance(columns, str):
         yield from select_columns_by_regex_pattern(df, columns)
 

diff --git a/sklearo/validation.py b/sklearo/validation.py
@@ -1,9 +1,11 @@
-from functools import wraps
+"""Validation utilities for sklearo."""
 
-from sklearo.utils import infer_target_type
+from functools import wraps
 
 
 def check_X_y(func):
+    """Decorator to check the input data X and y."""
+
     @wraps(func)
     def wrapper(self, X, y, *args, **kwargs):
         if not X.shape[0] == y.shape[0]:
@@ -22,6 +24,8 @@ def wrapper(self, X, y, *args, **kwargs):
 
 
 def check_if_fitted(func):
+    """Decorator to check if the model is fitted before calling the method."""
+
     @wraps(func)
     def wrapper(self, *args, **kwargs):
         if not any(

diff --git a/tests/encoding/test_target.py b/tests/encoding/test_target.py
@@ -43,7 +43,6 @@ def test_target_encoder_fit_transform_comparison_with_scikit_learn():
     "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
 )
 class TestTargetEncoder:
-
     @pytest.fixture
     def binary_class_data(self):
         data = {
@@ -708,7 +707,6 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame):
             encoder.fit(binary_class_data[["category"]], binary_class_data["target"])
 
     def test_target_encoder_fit_transform(self, binary_class_data, DataFrame):
-
         binary_class_data = DataFrame(
             {
                 "category": binary_class_data["category"] * 2,
@@ -746,7 +744,6 @@ def test_target_encoder_fit_transform(self, binary_class_data, DataFrame):
     def test_target_encoder_fit_transform_set_smoothing(
         self, binary_class_data, DataFrame
     ):
-
         binary_class_data = DataFrame(
             {
                 "category": binary_class_data["category"] * 2,

diff --git a/tests/encoding/test_woe.py b/tests/encoding/test_woe.py
@@ -10,7 +10,6 @@
     "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
 )
 class TestWOEEncoder:
-
     @pytest.fixture
     def binary_class_data(self):
         # fmt: off
@@ -340,7 +339,7 @@ def test_woe_encoder_handle_missing_values_multi_class(
             missing_values="encode", underrepresented_categories="fill"
         )
         encoder.fit(multi_class_data[["category"]], multi_class_data["target"])
-        transformed = encoder.transform(multi_class_data[["category"]])
+        encoder.transform(multi_class_data[["category"]])
 
         assert "MISSING" in encoder.encoding_map_["category"][1]
 
@@ -520,7 +519,6 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame):
             encoder.fit(binary_class_data[["category"]], binary_class_data["target"])
 
     def test_woe_encoder_fit_transform(self, binary_class_data, DataFrame):
-
         binary_class_data = DataFrame(
             {
                 "category": binary_class_data["category"] * 2,

diff --git a/tests/test_cv.py b/tests/test_cv.py
@@ -13,7 +13,6 @@
     "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
 )
 class TestCVFunctions:
-
     def test_add_cv_fold_id_column_k_fold(self, DataFrame):
         data = {
             "A": range(10),

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -19,7 +19,6 @@
     "DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
 )
 class TestSelectColumns:
-
     @pytest.fixture
     def sample_data(self):
         data = {
@@ -81,7 +80,6 @@ def test_select_columns_invalid_type(self, sample_data, DataFrame):
 
 @pytest.mark.parametrize("Series", [pd.Series, pl.Series], ids=["pandas", "polars"])
 class TestTypeOfTarget:
-
     @pytest.mark.parametrize(
         "data, expected",
         [
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""This package provides the main functionalities of the sklearo library."""