diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5ff24e13..b9065b11 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,10 +4,10 @@ jobs: test: name: Run unit tests runs-on: ubuntu-latest - timeout-minutes: 5 + timeout-minutes: 15 strategy: matrix: - python-version: ['3.10'] + python-version: ['3.11'] steps: - uses: actions/checkout@v2 - name: Setup Python diff --git a/README.md b/README.md index fcd98ee0..93258818 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ We have our own implementation of ordinary least squares in Python because this ## Quantile Regression Since we did not find any implementations of quantile regression in Python that fit our needs, we decided to write one ourselves. At the moment this uses two libraries, the version that solves the non-regularized problem uses `numpy`and solves the dual based on [this](https://arxiv.org/pdf/2305.12616.pdf) paper. The version that solves the regularized problem uses [`cvxpy`](https://www.cvxpy.org/#) and sets up the problem as a normal optimization problem. Eventually, we are planning on replacing the regularized version with the dual also. -## Transition matrices -We also have a solver for transition matrices. While this works arbitrarily, we have used this in the past for our primary election night model. We can still use this to create the sankey diagram coefficients. +## Transition Matrices +We also have a matrix regression solver built with `cvxpy`. We've used this for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams. ## Development We welcome contributions to this repo. Please open a Github issue for any issues or comments you have. diff --git a/setup.cfg b/setup.cfg index 000bb99c..70749624 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,4 +2,8 @@ universal = 1 [pycodestyle] -max-line-length = 160 \ No newline at end of file +max-line-length = 160 + +[pylint] +max-line-length = 160 +disable = invalid-name, duplicate-code, missing-function-docstring, too-many-instance-attributes, too-many-arguments, too-many-locals, missing-module-docstring \ No newline at end of file diff --git a/setup.py b/setup.py index 15340bfa..3a74a7a8 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "scipy~=1.11"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "scipy~=1.12"] THIS_FILE_DIR = os.path.dirname(__file__) @@ -29,7 +29,7 @@ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python", - "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], description="A package for optimization solvers", long_description=LONG_DESCRIPTION, diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 8e29cad3..95e31252 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -1,28 +1,90 @@ +import logging +import warnings + import cvxpy as cp +import numpy as np + +from elexsolver.logging import initialize_logging +from elexsolver.TransitionSolver import TransitionSolver + +initialize_logging() + +LOG = logging.getLogger(__name__) -class TransitionMatrixSolver: - def __init__(self): - self.transition_matrix = None +class TransitionMatrixSolver(TransitionSolver): + """ + Matrix regression transition solver using CVXPY. + """ + + def __init__(self, strict: bool = True, lam: float | None = None): + """ + Parameters + ---------- + strict : bool, default True + If `True`, solution will be constrainted so that all coefficients are >= 0, + <= 1, and the sum of each row equals 1. + lam : float, optional + `lam != 0` will enable L2 regularization (Ridge). + """ + super().__init__() + self._strict = strict + self._lambda = lam @staticmethod - def __get_constraint(X, strict): + def __get_constraints(coef: np.ndarray, strict: bool) -> list: if strict: - return [cp.sum(X, axis=1) == 1] - return [cp.sum(X, axis=1) <= 1.1, cp.sum(X, axis=1) >= 0.9] - - def __solve(self, A, B, strict): - transition_matrix = cp.Variable((A.shape[1], B.shape[1])) - loss_function = cp.norm(A @ transition_matrix - B, "fro") - objective = cp.Minimize(loss_function) - constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, strict) - problem = cp.Problem(objective, constraint) - problem.solve() + return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] + return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] + + def __standard_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray) -> cp.Minimize: + loss_function = cp.norm(A @ beta - B, "fro") + return cp.Minimize(loss_function) + + def __ridge_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray) -> cp.Minimize: + # Based on https://www.cvxpy.org/examples/machine_learning/ridge_regression.html + lam = cp.Parameter(nonneg=True, value=self._lambda) + loss_function = cp.pnorm(A @ beta - B, p=2) ** 2 + regularizer = cp.pnorm(beta, p=2) ** 2 + return cp.Minimize(loss_function + lam * regularizer) + + def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray) -> np.ndarray: + transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) + Aw = np.dot(weights, A) + Bw = np.dot(weights, B) + + if self._lambda is None or self._lambda == 0: + objective = self.__standard_objective(Aw, Bw, transition_matrix) + else: + objective = self.__ridge_objective(Aw, Bw, transition_matrix) + + constraints = TransitionMatrixSolver.__get_constraints(transition_matrix, self._strict) + problem = cp.Problem(objective, constraints) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + problem.solve(solver=cp.CLARABEL) + except (UserWarning, cp.error.SolverError) as e: + raise RuntimeError(e) from e + return transition_matrix.value - def fit(self, A, B, strict=False): - transition_matrix = self.__solve(A, B, strict) - self.transition_matrix = transition_matrix + def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None) -> np.ndarray: + self._check_any_element_nan_or_inf(X) + self._check_any_element_nan_or_inf(Y) + self._check_for_zero_units(X) + self._check_for_zero_units(Y) + + if not isinstance(X, np.ndarray): + X = X.to_numpy() + if not isinstance(Y, np.ndarray): + Y = Y.to_numpy() + + if X.shape[0] != Y.shape[0]: + raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") + + weights = self._check_and_prepare_weights(X, Y, sample_weight) - def predict(self, A): - return A @ self.transition_matrix + self.coefficients = self.__solve(X, Y, weights) + return self diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py new file mode 100644 index 00000000..b55ffae9 --- /dev/null +++ b/src/elexsolver/TransitionSolver.py @@ -0,0 +1,95 @@ +import logging + +import numpy as np + +from elexsolver.LinearSolver import LinearSolver +from elexsolver.logging import initialize_logging + +initialize_logging() + +LOG = logging.getLogger(__name__) + + +class TransitionSolver(LinearSolver): + """ + Abstract class for transition solvers. + """ + + def __init__(self): + """ + After model-fit, `self.coefficients` will contain + the solved coefficients, an np.ndarray matrix of float of shape + (number of columns in `X`) x (number of columns in `Y`). + Each float represents the percent of how much of row x is part of column y. + """ + super().__init__() + + def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None): + """ + Parameters + ---------- + X : np.ndarray matrix or pandas.DataFrame of int + Must have the same number of rows as `Y` but can have any number of columns greater than the number of rows. + Y : np.ndarray matrix or pandas.DataFrame of int + Must have the same number of rows as `X` but can have any number of columns greater than the number of rows. + sample_weight : list or np.ndarray or pandas.Series of int, optional + Must have the same length (number of rows) as both `X` and `Y`. + + Returns + ------- + `self` and populates `betas` with the beta coefficients determined by this solver. + `betas` is an np.ndarray matrix of float of shape (number of columns in `X`) x (number of columns in `Y`). + Each float represents the percent of how much of row x is part of column y. + """ + raise NotImplementedError + + def predict(self, X: np.ndarray) -> np.ndarray: + """ + Parameters + ---------- + X : np.ndarray matrix or pandas.DataFrame of int + Must have the same dimensions as the `X` supplied to `fit()`. + + Returns + ------- + `Y_hat`, np.ndarray of float of the same shape as Y. + """ + if self.coefficients is None: + raise RuntimeError("Solver must be fit before prediction can be performed.") + + self._check_any_element_nan_or_inf(X) + + return X @ self.coefficients + + def _check_for_zero_units(self, A: np.ndarray): + """ + If we have at least one unit whose columns are all zero, most if not all of our solvers will fail. + """ + if np.any(np.sum(A, axis=1) == 0): + raise ValueError("Matrix cannot contain any rows (units) where all columns (things) are zero.") + + def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None) -> np.ndarray: + """ + If `weights` is not None, and `weights` has the same number of rows in both matrices `X` and `Y`, + we'll rescale the weights by taking the square root after dividing them by their sum, + then return a diagonal matrix containing these now-normalized weights. + If `weights` is None, return a diagonal matrix of ones. + + Parameters + ---------- + X : np.ndarray matrix of int (same number of rows as `Y`) + Y : np.ndarray matrix of int (same number of rows as `X`) + weights : np.ndarray of int of the shape (number of rows in `X` and `Y`, 1), optional + """ + + if weights is not None: + if len(weights) != X.shape[0] and len(weights) != Y.shape[0]: + raise ValueError("weights must be the same length as the number of rows in X and Y.") + if isinstance(weights, list): + weights = np.array(weights).copy() + elif not isinstance(weights, np.ndarray): + # pandas.Series + weights = weights.values.copy() + return np.diag(np.sqrt(weights.flatten() / weights.sum())) + + return np.diag(np.ones((Y.shape[0],))) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py new file mode 100644 index 00000000..5796c887 --- /dev/null +++ b/tests/test_transition_matrix_solver.py @@ -0,0 +1,280 @@ +import numpy as np +import pytest + +from elexsolver.TransitionMatrixSolver import TransitionMatrixSolver + +RTOL = 1e-04 +ATOL = 1e-04 + + +def test_matrix_fit_predict_with_integers(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_betas = np.array([[9.99831808e-01, 1.68191521e-04], [1.49085896e-04, 9.99850914e-01]]) + expected_yhat = np.array( + [ + [1.00012998, 1.99987002], + [3.00009177, 3.99990823], + [5.00005356, 5.99994644], + [7.00001535, 7.99998465], + [8.99997714, 10.00002286], + [10.99993892, 12.00006108], + ] + ) + + tms = TransitionMatrixSolver().fit(X, Y) + current_yhat = tms.predict(X) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_yhat, current_yhat, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict(): + X = np.array( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ] + ) + + Y = np.array( + [ + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], + ] + ) + + expected_betas = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + expected_yhat = np.array( + [ + [0.39790396, 0.60209604], + [0.44969311, 0.55030689], + [0.46381742, 0.53618258], + [0.47040877, 0.52959123], + [0.47422481, 0.52577519], + [0.47671354, 0.52328646], + ] + ) + + tms = TransitionMatrixSolver().fit(X, Y) + current_yhat = tms.predict(X) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_yhat, current_yhat, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict_with_weights(): + X = np.array( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ] + ) + + Y = np.array( + [ + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], + ] + ) + + weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) + + expected_betas = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) + + tms = TransitionMatrixSolver().fit(X, Y, sample_weight=weights) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict_not_strict(): + X = np.array( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ] + ) + + Y = np.array( + [ + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], + ] + ) + + expected_betas = np.array([[0.760451, 0.239558], [0.216624, 0.783369]]) + + tms = TransitionMatrixSolver(strict=False).fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + + +def test_ridge_matrix_fit_predict(): + X = np.array( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ] + ) + + Y = np.array( + [ + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], + ] + ) + + expected_betas = np.array([[0.479416, 0.520584], [0.455918, 0.544082]]) + + tms = TransitionMatrixSolver(lam=1).fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict_pivoted(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ).T + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ).T + + expected_betas = np.array( + [ + [9.99706e-01, 1.85000e-04, 5.00000e-05, 2.80000e-05, 1.90000e-05, 1.30000e-05], + [4.80000e-05, 9.99464e-01, 3.43000e-04, 8.10000e-05, 4.00000e-05, 2.40000e-05], + [1.70000e-05, 1.56000e-04, 9.99188e-01, 4.86000e-04, 1.06000e-04, 4.70000e-05], + [1.00000e-05, 4.60000e-05, 2.76000e-04, 9.98960e-01, 5.93000e-04, 1.14000e-04], + [7.00000e-06, 2.40000e-05, 7.40000e-05, 3.88000e-04, 9.98887e-01, 6.20000e-04], + [5.00000e-06, 1.50000e-05, 3.60000e-05, 9.70000e-05, 4.66000e-04, 9.99382e-01], + ] + ) + + tms = TransitionMatrixSolver().fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict_bad_dimensions(): + X = np.array( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + ] + ) + + tms = TransitionMatrixSolver() + with pytest.raises(ValueError): + tms.fit(X, Y) + + +def test_matrix_fit_predict_pandas(): + try: + import pandas # pylint: disable=import-outside-toplevel + + X = pandas.DataFrame( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ], + columns=["x1", "x2"], + ) + + Y = pandas.DataFrame( + [ + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], + ], + columns=["y1", "y2"], + ) + + expected_betas = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + + tms = TransitionMatrixSolver().fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + + except ImportError: + # pass this test through since pandas isn't a requirement for elex-solver + assert True diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py new file mode 100644 index 00000000..b7bc9aa3 --- /dev/null +++ b/tests/test_transition_solver.py @@ -0,0 +1,131 @@ +from unittest.mock import patch + +import numpy as np +import pytest + +from elexsolver.TransitionSolver import TransitionSolver + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_fit(): + with pytest.raises(NotImplementedError): + ts = TransitionSolver() + ts.fit(None, None) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_predict(): + with pytest.raises(RuntimeError): + ts = TransitionSolver() + ts.predict(None) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_get_coefficients(): + ts = TransitionSolver() + assert ts.coefficients is None + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_any_element_nan_or_inf_with_nan(): + with pytest.raises(ValueError): + A = np.array([[0.1, 0.2, 0.3], [0.4, np.nan, 0.6]]) + ts = TransitionSolver() + ts._check_any_element_nan_or_inf(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_any_element_nan_or_inf_without_nan(): + A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + ts = TransitionSolver() + ts._check_any_element_nan_or_inf(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_for_zero_units_good(): + A = np.array( + [ + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + [0.7, 0.8, 0.9], + ] + ) + ts = TransitionSolver() + ts._check_for_zero_units(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_for_zero_units_bad(): + with pytest.raises(ValueError): + A = np.array( + [ + [0.1, 0.2, 0.3], + [0.0, 0.0, 0.0], + [0.7, 0.8, 0.9], + ] + ) + ts = TransitionSolver() + ts._check_for_zero_units(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_bad(): + with pytest.raises(ValueError): + weights = [1, 2] + A = np.array([[1, 2], [3, 4], [5, 6]]) + B = A.copy() + ts = TransitionSolver() + ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_none(): + A = np.array([[1, 2], [3, 4], [5, 6]]) + B = A.copy() + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, None) # pylint: disable=protected-access + np.testing.assert_array_equal(expected, current) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_with_weights(): + A = np.array([[1, 2, 3], [4, 5, 6]]) + B = A.copy() + weights = np.array([0.6, 0.4]) + expected = np.array([[0.77459667, 0], [0, 0.63245553]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + np.testing.assert_allclose(expected, current) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_with_weights_list(): + A = np.array([[1, 2, 3], [4, 5, 6]]) + B = A.copy() + weights = [0.6, 0.4] + expected = np.array([[0.77459667, 0], [0, 0.63245553]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + np.testing.assert_allclose(expected, current) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_with_weights_pandas(): + try: + import pandas # pylint: disable=import-outside-toplevel + + A = np.array([[1, 2, 3], [4, 5, 6]]) + B = A.copy() + weights = pandas.Series([0.6, 0.4]) + expected = np.array([[0.77459667, 0], [0, 0.63245553]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + np.testing.assert_allclose(expected, current) + except ImportError: + # pass this test through since pandas isn't a requirement for elex-solver + assert True