Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] concat operation on distributions #499

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .all-contributorsrc
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,15 @@
"code"
]
},
{
"login": "sairevanth25",
"name": "Sai Revanth Gowravajhala",
"avatar_url": "https://avatars.githubusercontent.com/SaiRevanth25",
"profile": "https://github.com/SaiRevanth25",
"contributions": [
"code"
]
},
{
"login": "malikrafsan",
"name": "Malik Akbar Hashemi Rafsanjani",
Expand Down
5 changes: 4 additions & 1 deletion skpro/distributions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
"Beta",
"Binomial",
"ChiSquared",
"concat",
"ConcatDistr",
"Delta",
"Empirical",
"Exponential",
Expand Down Expand Up @@ -43,7 +45,8 @@
from skpro.distributions.beta import Beta
from skpro.distributions.binomial import Binomial
from skpro.distributions.chi_squared import ChiSquared
from skpro.distributions.compose import IID
from skpro.distributions.compose import IID, ConcatDistr
from skpro.distributions.concat import concat
from skpro.distributions.delta import Delta
from skpro.distributions.empirical import Empirical
from skpro.distributions.exponential import Exponential
Expand Down
6 changes: 5 additions & 1 deletion skpro/distributions/compose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
# adapted from sktime

__all__ = ["IID"]
__all__ = [
"ConcatDistr",
"IID",
]

from skpro.distributions.compose._concat import ConcatDistr
from skpro.distributions.compose._iid import IID
297 changes: 297 additions & 0 deletions skpro/distributions/compose/_concat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
"""Base class for concat operation."""

__author__ = ["SaiRevanth25"]

import pandas as pd

from skpro.base import BaseMetaEstimator
from skpro.distributions.base import BaseDistribution


class ConcatDistr(BaseMetaEstimator, BaseDistribution):
    """Concatenate the given distributions along specified axis.

    Each distribution method of this class calls the method of the same name
    on every component distribution and joins the results with ``pd.concat``.

    Parameters
    ----------
    distributions : list
        list of distributions
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along
    ignore_index : bool, default False
        If True, do not use the index values along the concatenation axis.
        The resulting axis will be labeled 0, ..., n - 1.
        Only consulted when both ``index`` and ``columns`` are ``None``;
        otherwise labels along the concatenation axis are always discarded
        before the user-supplied labels are applied.
    index : index-like, optional, default None
        Row labels assigned to the concatenated result.
        Must have as many entries as the concatenated result has rows.
    columns : index-like, optional, default None
        Column labels assigned to the concatenated result.
        Must have as many entries as the concatenated result has columns.
    """

    # for default get_params/set_params from BaseMetaEstimator
    # named_object_parameters points to the attribute of self
    # which contains the heterogeneous set of estimators
    # this must be an iterable of (name: str, estimator, ...) tuples for the default
    # if the estimator is fittable, _BaseMetaEstimator also
    # provides an override for get_fitted_params for params from the fitted estimators
    # the fitted estimators should be in fitted_named_object_parameters
    # this must be an iterable of (name: str, estimator, ...) tuples for the default
    _tags = {
        # packaging info
        # --------------
        "authors": ["SaiRevanth25", "fkiraly"],
        #
        # estimator tags
        # --------------
        "named_object_parameters": "_distributions",
        "capabilities:approx": ["pdfnorm", "energy"],
        "capabilities:exact": ["mean", "var", "pdf", "log_pdf", "cdf", "ppf"],
    }

    def __init__(
        self,
        distributions,
        axis=0,
        ignore_index=False,
        index=None,
        columns=None,
    ):
        """Initialize concat with list of distributions and axis for concatenation."""
        self.distributions = distributions
        self.axis = axis
        self.ignore_index = ignore_index
        self.index = index
        self.columns = columns

        # named view of the components, referenced by the
        # "named_object_parameters" tag above
        self._distributions = self._coerce_to_named_object_tuples(distributions)

        super().__init__(index=index, columns=columns)

    def _concat(self, method, *args, **kwargs):
        """Call ``method`` on every component and concatenate the results.

        Parameters
        ----------
        method : str
            name of the attribute to call on each component distribution
        *args, **kwargs
            forwarded unchanged to every component call

        Returns
        -------
        pandas object returned by ``pd.concat``
            component results joined along ``self.axis``; if ``self.index`` and/or
            ``self.columns`` are set, they replace the corresponding labels

        Raises
        ------
        ValueError
            raised by pandas if ``self.index`` or ``self.columns`` does not match
            the size of the concatenated result
        """
        # NOTE(review): every component receives the *same* arguments. If ``x``/``p``
        # is meant to have the shape of the concatenated distribution, it would
        # have to be split per component before dispatch - confirm with callers.
        results = [
            getattr(distr, method)(*args, **kwargs) for distr in self.distributions
        ]

        if self.index is None and self.columns is None:
            return pd.concat(results, axis=self.axis, ignore_index=self.ignore_index)

        concat_df = pd.concat(results, axis=self.axis, ignore_index=True)

        # user-supplied labels are applied on both axes, whatever the concat axis
        # (and whether it was spelled 0/1 or 'index'/'columns'); previously only
        # one of the two was applied and the other was silently dropped
        if self.index is not None:
            concat_df.index = self.index
        if self.columns is not None:
            concat_df.columns = self.columns

        return concat_df

    def _mean(self):
        """Return expected value of the distribution.

        Returns
        -------
        concatenation of the components' ``mean()`` results, see ``_concat``
        """
        return self._concat("mean")

    def _var(self):
        r"""Return element/entry-wise variance of the distribution.

        Returns
        -------
        concatenation of the components' ``var()`` results, see ``_concat``
        """
        return self._concat("var")

    def _pdf(self, x):
        """Probability density function.

        Parameters
        ----------
        x : values to evaluate the pdf at, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``pdf(x=x)`` results, see ``_concat``
        """
        return self._concat("pdf", x=x)

    def _log_pdf(self, x):
        """Logarithmic probability density function.

        Parameters
        ----------
        x : values to evaluate the log-pdf at, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``log_pdf(x=x)`` results, see ``_concat``
        """
        return self._concat("log_pdf", x=x)

    def _pmf(self, x):
        """Probability mass function.

        Parameters
        ----------
        x : values to evaluate the pmf at, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``pmf(x=x)`` results, see ``_concat``
        """
        return self._concat("pmf", x=x)

    def _log_pmf(self, x):
        """Logarithmic probability mass function.

        Parameters
        ----------
        x : values to evaluate the log-pmf at, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``log_pmf(x=x)`` results, see ``_concat``
        """
        return self._concat("log_pmf", x=x)

    def _cdf(self, x):
        """Cumulative distribution function.

        Parameters
        ----------
        x : values to evaluate the cdf at, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``cdf(x=x)`` results, see ``_concat``
        """
        return self._concat("cdf", x=x)

    def _ppf(self, p):
        """Quantile function = percent point function = inverse cdf.

        Parameters
        ----------
        p : values to evaluate the ppf at, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``ppf(p=p)`` results, see ``_concat``
        """
        return self._concat("ppf", p=p)

    def _energy_self(self):
        r"""Energy of self, w.r.t. self.

        :math:`\mathbb{E}[|X-Y|]`, where :math:`X, Y` are i.i.d. copies of self.

        Returns
        -------
        concatenation of the components' ``energy_self()`` results, see ``_concat``
        """
        # NOTE(review): dispatches to an attribute literally named ``energy_self``
        # on each component - confirm the components expose it under that name.
        return self._concat("energy_self")

    def _energy_x(self, x):
        r"""Energy of self, w.r.t. a constant frame x.

        :math:`\mathbb{E}[|X-x|]`, where :math:`X` is a copy of self,
        and :math:`x` is a constant.

        Parameters
        ----------
        x : values to compute energy w.r.t. to, passed unchanged to every component

        Returns
        -------
        concatenation of the components' ``energy_x(x=x)`` results, see ``_concat``
        """
        # NOTE(review): dispatches to an attribute literally named ``energy_x``
        # on each component - confirm the components expose it under that name.
        return self._concat("energy_x", x=x)

    def sample(self, n_samples=None):
        """Sample from the distribution.

        Parameters
        ----------
        n_samples : int, optional, default = None
            number of samples to draw from the distribution

        Returns
        -------
        pd.DataFrame
            samples from the distribution

            * if ``n_samples`` is ``None``:
            returns a single concatenated draw, i.e., the components' ``sample()``
            results joined by ``_concat`` (including any ``index``/``columns``
            override)
            * if n_samples is ``int``:
            returns ``n_samples`` such draws stacked row-wise, with a row
            ``MultiIndex`` whose outer level is ``0, ..., n_samples - 1`` and
            whose inner level is the row index of a single draw
        """
        if n_samples is None:
            return self._concat("sample")

        # Build the multi-sample frame from independent single draws.
        # Pushing ``n_samples`` down into ``_concat`` lets ``ignore_index`` and
        # the ``index`` override wipe or mismatch the sample-level MultiIndex;
        # stacking complete draws keeps the documented layout in every
        # configuration and preserves the row order of a single draw.
        draws = [self._concat("sample") for _ in range(n_samples)]
        return pd.concat(draws, axis=0, keys=list(range(n_samples)))

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        from skpro.distributions import Normal, Uniform

        n32 = Normal(0, 1, index=[0, 1, 2], columns=["a", "b"])
        u32 = Uniform(-1, 1, index=[3, 4, 5], columns=["a", "b"])

        # column-wise concat of two identically indexed components
        params0 = {
            "distributions": [n32, n32],
            "axis": 1,
            "ignore_index": True,
        }
        # row-wise concat (default axis) keeping the components' row labels
        params1 = {
            "distributions": [n32, u32],
            "ignore_index": False,
        }
        # column-wise concat with explicit labels on both axes
        # NOTE(review): n32 and u32 are built with disjoint row labels
        # ([0, 1, 2] vs [3, 4, 5]); if component results carry those labels,
        # an axis=1 concat aligns to six rows while ``index`` has three entries -
        # confirm this parameter set is intended.
        params2 = {
            "distributions": [n32, u32],
            "axis": 1,
            "index": [0, 2, 4],
            "columns": ["a", "b", "foo", "bar"],
        }

        return [params0, params1, params2]
54 changes: 54 additions & 0 deletions skpro/distributions/concat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
"""Concat operation."""

__author__ = ["SaiRevanth25"]

from skpro.distributions.compose._concat import ConcatDistr


def concat(objs, axis=0, ignore_index=False):
    """Concatenate a list of distributions into a ConcatDistr.

    Thin convenience wrapper: validates the arguments and returns
    ``ConcatDistr(objs, axis=axis, ignore_index=ignore_index)``.

    Parameters
    ----------
    objs : list
        Non-empty list of distribution-like objects to concatenate.
    axis : {0/'index', 1/'columns'}, default 0
        Axis to concatenate along.
    ignore_index : bool, default False
        Passed on to ``ConcatDistr``. If True, labels along the concatenation
        axis are not used and the resulting axis is labeled 0, ..., n - 1.

    Returns
    -------
    ConcatDistr
        An object representing the concatenation of the given distributions.

    Raises
    ------
    ValueError
        If ``objs`` is not a list, if ``objs`` is empty,
        or if ``axis`` is not one of 0, 1, 'index', 'columns'.

    Examples
    --------
    >>> from skpro.distributions import Normal, concat  # doctest: +SKIP
    >>> d1 = Normal(mu=[[1, 2], [3, 4]], sigma=1)  # doctest: +SKIP
    >>> d2 = Normal(mu=0, sigma=[[2, 42]])  # doctest: +SKIP
    >>> d12 = concat([d1, d2])  # doctest: +SKIP
    >>> d12_mean = d12.mean()  # doctest: +SKIP
    >>> d12_var = d12.var()  # doctest: +SKIP
    """
    if not isinstance(objs, list):
        raise ValueError("`objs` must be a list of distribution-like objects.")
    # fail early with a clear message; an empty list would otherwise only
    # surface later, when the first method call reaches pandas.concat
    if not objs:
        raise ValueError("`objs` must contain at least one distribution.")
    if axis not in (0, 1, "index", "columns"):
        raise ValueError("`axis` must be one of {0, 1, 'index', 'columns'}.")

    return ConcatDistr(objs, axis=axis, ignore_index=ignore_index)
Loading