Skip to content

Commit

Permalink
Add ruff as a linter
Browse files Browse the repository at this point in the history
  • Loading branch information
ClaudioSalvatoreArcidiacono committed Jan 7, 2025
1 parent e27c918 commit c7cc083
Show file tree
Hide file tree
Showing 15 changed files with 68 additions and 26 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,12 @@ repos:
rev: 24.4.2
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.8.6
hooks:
# Run the linter.
- id: ruff
args: [ --fix ]
# Run the formatter.
- id: ruff-format
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,13 @@ exclude = '''

[tool.isort]
profile = "black"

[tool.ruff.lint]
select = ["E4", "E7", "E9", "F", "B", "D"]

[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402"]
"**/{tests,docs,tools}/*" = ["D"]
1 change: 1 addition & 0 deletions sklearo/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""This package provides the main functionalities of the sklearo library."""
8 changes: 7 additions & 1 deletion sklearo/base.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
"""This module provides base classes for transformers in the encoding process."""

from abc import ABC, abstractmethod

from narwhals.typing import IntoFrameT, IntoSeriesT


class BaseTransformer(ABC): # pragma: no cover
class BaseTransformer(ABC):
    """Abstract interface shared by all transformers.

    Subclasses must implement ``fit`` and ``transform``. ``fit_transform`` is
    provided here and simply chains the two.
    """

    @abstractmethod
    def fit(self, X: IntoFrameT, y: IntoSeriesT | None = None) -> None:
        """Fit the transformer to the data.

        Args:
            X: Input data to fit on.
            y: Optional target values. Defaults to ``None``.
        """

    @abstractmethod
    def transform(self, X: IntoFrameT) -> IntoFrameT:
        """Transform the data.

        Args:
            X: Input data to transform.

        Returns:
            The transformed data.
        """

    def fit_transform(self, X: IntoFrameT, y: IntoSeriesT | None = None) -> IntoFrameT:
        """Fit the transformer, then transform the same data.

        Args:
            X: Input data passed to both ``fit`` and ``transform``.
            y: Optional target values forwarded to ``fit``. Defaults to ``None``.

        Returns:
            The result of calling ``transform`` on ``X`` after fitting.
        """
        self.fit(X, y)
        transformed = self.transform(X)
        return transformed
11 changes: 5 additions & 6 deletions sklearo/cv.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""This module provides cross-validation utilities for model evaluation."""

import math

import narwhals as nw
from narwhals.typing import IntoFrameT, IntoSeriesT


def ceil_div(col: nw.Expr, divisor: int) -> nw.Expr:
"""
Perform ceiling division on a column.
"""Perform ceiling division on a column.
This function divides each element in the given column by the specified divisor
and returns the smallest integer greater than or equal to the result of the division.
Expand All @@ -22,8 +23,7 @@ def ceil_div(col: nw.Expr, divisor: int) -> nw.Expr:


def add_cv_fold_id_column_k_fold(X: IntoFrameT, k: int = 5) -> IntoFrameT:
"""
Add a column `fold_id` to the DataFrame indicating the fold ID for k-fold cross-validation.
"""Add a column `fold_id` to the DataFrame indicating the fold ID for k-fold cross-validation.
This function divides the input DataFrame into k folds, ensuring that each fold
has approximately the same number of samples. The fold IDs are assigned in a way
Expand Down Expand Up @@ -82,8 +82,7 @@ def add_cv_fold_id_column_k_fold(X: IntoFrameT, k: int = 5) -> IntoFrameT:
def add_cv_fold_id_column_stratified_k_fold(
X: IntoFrameT, y: IntoSeriesT, k: int = 5
) -> IntoFrameT:
"""
Add a `fold_id` column to the DataFrame indicating the fold ID for stratified k-fold CV.
"""Add a `fold_id` column to the DataFrame indicating the fold ID for stratified k-fold CV.
This function ensures that each fold has approximately the same proportion of each class
as the original dataset. It calculates the fold IDs based on the distribution of the target
Expand Down
2 changes: 2 additions & 0 deletions sklearo/encoding/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""This module provides encoding techniques for categorical features."""

from .target import TargetEncoder
from .woe import WOEEncoder

Expand Down
13 changes: 10 additions & 3 deletions sklearo/encoding/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Base classes for encoders."""

import warnings
from abc import abstractmethod
from collections import defaultdict
Expand All @@ -15,14 +17,16 @@


class BaseOneToOneEncoder(BaseTransformer):
"""Base class for one-to-one encoders."""

def _handle_missing_values(self, X: IntoFrameT) -> IntoFrameT:
"""Handles missing values in the input data."""
if self.missing_values == "ignore":
return X
if self.missing_values == "raise":
if max(X[self.columns_].null_count().row(0)) > 0:
raise ValueError(
f"Some columns have missing values. "
"Some columns have missing values. "
"Please handle missing values before encoding or set "
"missing_values to either 'ignore' or 'encode'."
)
Expand All @@ -40,6 +44,7 @@ def _handle_missing_values(self, X: IntoFrameT) -> IntoFrameT:


class BaseTargetEncoder(BaseOneToOneEncoder):
"""Abstract base class for target encoders."""

@abstractmethod
def _calculate_target_statistic(
Expand All @@ -49,6 +54,7 @@ def _calculate_target_statistic(
raise NotImplementedError # pragma: no cover

def check_target_type(self, y: IntoSeriesT) -> str:
"""Check the type of the target variable."""
if hasattr(self, "target_type_"):
return
if not hasattr(self, "target_type") or self.target_type == "auto":
Expand All @@ -71,7 +77,6 @@ def fit(self, X: IntoFrameT, y: IntoSeriesT) -> "BaseTargetEncoder":
X (DataFrame): The input data.
y (Series): The target variable.
"""

self.check_target_type(y)
self.columns_ = list(select_columns(X, self.columns))
self.encoding_map_ = {}
Expand Down Expand Up @@ -163,7 +168,8 @@ def transform(self, X: IntoFrameT) -> IntoFrameT:
warnings.warn(
f"Unseen categories {unseen_per_col} found during transform. "
"Please handle unseen categories for example by using a RareLabelEncoder. "
f"These categories will be encoded as {self.fill_value_unseen}."
f"These categories will be encoded as {self.fill_value_unseen}.",
stacklevel=2,
)

if self.target_type_ in ("binary", "continuous"):
Expand All @@ -174,6 +180,7 @@ def transform(self, X: IntoFrameT) -> IntoFrameT:

@check_if_fitted
def get_feature_names_out(self) -> list[str]:
"""Get the output feature names."""
if self.target_type_ in ("binary", "continuous"):
return self.feature_names_in_

Expand Down
14 changes: 9 additions & 5 deletions sklearo/encoding/target.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Any, Literal, Sequence, Tuple
"""TargetEncoder class for encoding categorical features using the Target Encoding technique."""

from typing import Literal, Sequence

import narwhals as nw
from narwhals.typing import IntoFrameT
Expand All @@ -9,8 +11,7 @@


class TargetEncoder(BaseTargetEncoder):
"""
Target Encoder for categorical features.
"""Target Encoder for categorical features.
This class provides functionality to encode categorical features using the Target Encoding
technique. Target Encoding replaces each category with the mean of the target variable for that
Expand Down Expand Up @@ -51,6 +52,10 @@ class TargetEncoder(BaseTargetEncoder):
- If `'multiclass'`, the target variable is multiclass.
- If `'continuous'`, the target variable is continuous.
smooth (float, Literal["auto"]): Smoothing parameter to avoid overfitting. If `'auto'`, the
smoothing parameter is calculated based on the variance of the target variable.
Attributes:
columns_ (list[str]): List of columns to be encoded, learned during fit.
encoding_map_ (dict[str, float]): Mapping of categories to their mean target values, learned
Expand Down Expand Up @@ -98,7 +103,7 @@ def __init__(
smooth: Literal["auto"] | float = "auto",
cv: Annotated[int, Field(ge=2)] = 5,
) -> None:

"""Class constructor for TargetEncoder."""
self.columns = columns
self.missing_values = missing_values
self.unseen = unseen
Expand All @@ -112,7 +117,6 @@ def __init__(
def _calculate_target_statistic(
self, x_y: IntoFrameT, target_col: str, column: str
) -> dict:

if column in (
"count_per_category",
"sum_target_per_category",
Expand Down
3 changes: 3 additions & 0 deletions sklearo/encoding/woe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""This module provides the Weight of Evidence (WoE) encoding techniques for categorical features."""

import math
from typing import Literal, Sequence

Expand Down Expand Up @@ -143,6 +145,7 @@ def __init__(
missing_values: Literal["encode", "ignore", "raise"] = "encode",
cv: Annotated[int, Field(ge=2)] = 5,
) -> None:
"""Initializes the WoEEncoder with the specified parameters."""
self.columns = columns
self.underrepresented_categories = underrepresented_categories
self.missing_values = missing_values
Expand Down
5 changes: 5 additions & 0 deletions sklearo/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Utility functions for the sklearo package."""

import inspect
import re
from typing import Sequence
Expand All @@ -20,18 +22,21 @@


def select_columns_by_regex_pattern(df: nw.DataFrame, pattern: str):
    """Yield the names of the columns of ``df`` that match a regex pattern.

    Matching is done with ``re.search``, so the pattern may match anywhere
    inside a column name rather than only at its start.

    Args:
        df: DataFrame whose ``columns`` are inspected.
        pattern: Regular expression searched for in each column name.

    Yields:
        Each column name for which the search finds a match, in the order
        given by ``df.columns``.
    """
    yield from (name for name in df.columns if re.search(pattern, name))


def select_columns_by_types(df: nw.DataFrame, dtypes: list[nw.dtypes.DType]):
    """Yield the names of the columns of ``df`` whose dtype is in ``dtypes``.

    Column names and dtypes are read pairwise from ``df.schema``.

    Args:
        df: DataFrame whose schema is inspected.
        dtypes: Data types to keep; a column is selected when its dtype is
            contained in this list.

    Yields:
        Each column name whose dtype is found in ``dtypes``, in schema order.
    """
    names_with_dtypes = zip(df.schema.names(), df.schema.dtypes())
    yield from (name for name, kind in names_with_dtypes if kind in dtypes)


def select_columns(df: nw.DataFrame, columns: Sequence[nw.typing.DTypes | str] | str):
"""Selects specified columns from the DataFrame."""
if isinstance(columns, str):
yield from select_columns_by_regex_pattern(df, columns)

Expand Down
8 changes: 6 additions & 2 deletions sklearo/validation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from functools import wraps
"""Validation utilities for sklearo."""

from sklearo.utils import infer_target_type
from functools import wraps


def check_X_y(func):
"""Decorator to check the input data X and y."""

@wraps(func)
def wrapper(self, X, y, *args, **kwargs):
if not X.shape[0] == y.shape[0]:
Expand All @@ -22,6 +24,8 @@ def wrapper(self, X, y, *args, **kwargs):


def check_if_fitted(func):
"""Decorator to check if the model is fitted before calling the method."""

@wraps(func)
def wrapper(self, *args, **kwargs):
if not any(
Expand Down
3 changes: 0 additions & 3 deletions tests/encoding/test_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def test_target_encoder_fit_transform_comparison_with_scikit_learn():
"DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
)
class TestTargetEncoder:

@pytest.fixture
def binary_class_data(self):
data = {
Expand Down Expand Up @@ -708,7 +707,6 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame):
encoder.fit(binary_class_data[["category"]], binary_class_data["target"])

def test_target_encoder_fit_transform(self, binary_class_data, DataFrame):

binary_class_data = DataFrame(
{
"category": binary_class_data["category"] * 2,
Expand Down Expand Up @@ -746,7 +744,6 @@ def test_target_encoder_fit_transform(self, binary_class_data, DataFrame):
def test_target_encoder_fit_transform_set_smoothing(
self, binary_class_data, DataFrame
):

binary_class_data = DataFrame(
{
"category": binary_class_data["category"] * 2,
Expand Down
4 changes: 1 addition & 3 deletions tests/encoding/test_woe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
)
class TestWOEEncoder:

@pytest.fixture
def binary_class_data(self):
# fmt: off
Expand Down Expand Up @@ -340,7 +339,7 @@ def test_woe_encoder_handle_missing_values_multi_class(
missing_values="encode", underrepresented_categories="fill"
)
encoder.fit(multi_class_data[["category"]], multi_class_data["target"])
transformed = encoder.transform(multi_class_data[["category"]])
encoder.transform(multi_class_data[["category"]])

assert "MISSING" in encoder.encoding_map_["category"][1]

Expand Down Expand Up @@ -520,7 +519,6 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame):
encoder.fit(binary_class_data[["category"]], binary_class_data["target"])

def test_woe_encoder_fit_transform(self, binary_class_data, DataFrame):

binary_class_data = DataFrame(
{
"category": binary_class_data["category"] * 2,
Expand Down
1 change: 0 additions & 1 deletion tests/test_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
"DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
)
class TestCVFunctions:

def test_add_cv_fold_id_column_k_fold(self, DataFrame):
data = {
"A": range(10),
Expand Down
2 changes: 0 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"DataFrame", [pd.DataFrame, pl.DataFrame], ids=["pandas", "polars"]
)
class TestSelectColumns:

@pytest.fixture
def sample_data(self):
data = {
Expand Down Expand Up @@ -81,7 +80,6 @@ def test_select_columns_invalid_type(self, sample_data, DataFrame):

@pytest.mark.parametrize("Series", [pd.Series, pl.Series], ids=["pandas", "polars"])
class TestTypeOfTarget:

@pytest.mark.parametrize(
"data, expected",
[
Expand Down

0 comments on commit c7cc083

Please sign in to comment.