Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .all-contributorsrc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,12 @@
"bug",
"doc"
]
},
{
"login": "paramsureliya",
"name": "Param Sureliya",
"profile": "https://github.com/paramsureliya",
"contributions": ["code"]
}
]
}
130 changes: 130 additions & 0 deletions skpro/regression/_dist_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
"""Distribution string normalisation utility for skpro regressors.

Provides ``_normalize_dist_str``, which maps all known string aliases for a
probability distribution to the canonical capitalized class name used in skpro
(e.g. ``"gaussian"`` -> ``"Normal"``, ``"t"`` -> ``"TDistribution"``).

Every probabilistic regressor adapter should call this before its own internal
string -> object mapping so that users can pass any reasonable alias and have it
work uniformly across regressors and in GridSearchCV / RandomizedSearchCV.
"""

# Lower-case alias -> canonical skpro distribution class name.
# Keys must be lower-case; lookups lower-case the user input before matching.
_DIST_ALIAS_MAP: dict[str, str] = {
    # Normal / Gaussian
    "normal": "Normal",
    "gaussian": "Normal",
    "norm": "Normal",
    # Laplace
    "laplace": "Laplace",
    "double_exponential": "Laplace",
    # LogNormal
    "lognormal": "LogNormal",
    "log_normal": "LogNormal",
    "log-normal": "LogNormal",
    "log normal": "LogNormal",
    # TDistribution
    "tdistribution": "TDistribution",
    "t_distribution": "TDistribution",
    "t-distribution": "TDistribution",
    "t": "TDistribution",
    "student_t": "TDistribution",
    "studentt": "TDistribution",
    "student-t": "TDistribution",
    # Poisson
    "poisson": "Poisson",
    # Exponential
    "exponential": "Exponential",
    "exp": "Exponential",
    # Gamma
    "gamma": "Gamma",
    # Beta
    "beta": "Beta",
    # Weibull
    "weibull": "Weibull",
    # Cauchy
    "cauchy": "Cauchy",
    # Binomial
    "binomial": "Binomial",
    "binom": "Binomial",
    # NegativeBinomial
    "negativebinomial": "NegativeBinomial",
    "negative_binomial": "NegativeBinomial",
    "negative.binomial": "NegativeBinomial",
    "negbinomial": "NegativeBinomial",
    "negbin": "NegativeBinomial",
    "neg_binomial": "NegativeBinomial",
    # InverseGaussian
    "inversegaussian": "InverseGaussian",
    "inverse_gaussian": "InverseGaussian",
    "inverse.gaussian": "InverseGaussian",
    "inv_gaussian": "InverseGaussian",
    # Tweedie
    "tweedie": "Tweedie",
    # Logistic / SinhLogistic (QPD inner distributions used by CyclicBoosting)
    "logistic": "Logistic",
    "sinhlogistic": "SinhLogistic",
    "sinh_logistic": "SinhLogistic",
    "sinh-logistic": "SinhLogistic",
}

# Deletes the characters treated as interchangeable word separators
# (underscore, hyphen, dot, space) when passed to ``str.translate``.
_DIST_SEPARATOR_TABLE = str.maketrans("", "", "_-. ")

# Separator-free view of ``_DIST_ALIAS_MAP``. Derived from the table above so
# that every alias is accepted with any separator (or none) without having to
# enumerate each spelling by hand, e.g. "negative-binomial", "student t".
_DIST_COMPACT_ALIAS_MAP: dict[str, str] = {
    alias.translate(_DIST_SEPARATOR_TABLE): canonical
    for alias, canonical in _DIST_ALIAS_MAP.items()
}


def _normalize_dist_str(dist: str) -> str:
    """Normalize a distribution string to the canonical capitalized class name.

    Maps every known alias to the capitalized skpro class name, e.g.::

        _normalize_dist_str("gaussian")   -> "Normal"
        _normalize_dist_str("t")          -> "TDistribution"
        _normalize_dist_str("lognormal")  -> "LogNormal"
        _normalize_dist_str("Normal")     -> "Normal"  # already canonical

    Matching is case-insensitive, ignores leading and trailing whitespace,
    and treats ``_``, ``-``, ``.`` and spaces inside the name as
    interchangeable, optional separators, so ``"negative-binomial"``,
    ``"Negative Binomial"`` and ``"negativebinomial"`` all resolve to
    ``"NegativeBinomial"``.

    Non-string inputs (e.g. a distribution class or object) are returned
    unchanged so callers do not need to guard separately.

    Unknown strings emit a ``UserWarning`` and are returned as-is (exactly as
    passed, without stripping) to preserve backward-compatibility with any
    existing library-specific aliases.

    Parameters
    ----------
    dist : str
        Distribution name in any accepted format.

    Returns
    -------
    str
        Canonical distribution name (capitalized class name in skpro),
        or ``dist`` itself if it is not a string or not a known alias.

    Warns
    -----
    UserWarning
        If ``dist`` is a string that does not match any known alias.

    Examples
    --------
    >>> from skpro.regression._dist_utils import _normalize_dist_str
    >>> _normalize_dist_str("gaussian")
    'Normal'
    >>> _normalize_dist_str("lognormal")
    'LogNormal'
    >>> _normalize_dist_str("t")
    'TDistribution'
    >>> _normalize_dist_str("Normal")
    'Normal'
    >>> _normalize_dist_str("negative-binomial")
    'NegativeBinomial'
    """
    if not isinstance(dist, str):
        return dist

    key = dist.strip().lower()

    # 1. Direct alias lookup (handles the vast majority of cases)
    canonical = _DIST_ALIAS_MAP.get(key)

    # 2. Separator-insensitive lookup, for spellings that differ from a listed
    #    alias only in the separator used between words
    if canonical is None:
        compact_key = key.translate(_DIST_SEPARATOR_TABLE)
        canonical = _DIST_COMPACT_ALIAS_MAP.get(compact_key)

    if canonical is not None:
        return canonical

    # 3. Unknown — warn but do not raise (preserves backward-compatibility)
    import warnings

    warnings.warn(
        f"Distribution string '{dist}' is not recognised by _normalize_dist_str "
        f"and will be passed through unchanged. If this is intentional, consider "
        f"adding it to _DIST_ALIAS_MAP in skpro/regression/_dist_utils.py.",
        UserWarning,
        stacklevel=2,
    )
    return dist
24 changes: 17 additions & 7 deletions skpro/regression/adapters/ngboost/_ngboost_proba.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

__author__ = ["ShreeshaM07"]

from skpro.regression._dist_utils import _normalize_dist_str


class NGBoostAdapter:
"""Adapter to interconvert NGBoost and skpro BaseDistributions.
Expand Down Expand Up @@ -34,6 +36,9 @@ def _dist_to_ngboost_instance(self, dist, survival=False):
"""
from ngboost.distns import Exponential, Laplace, LogNormal, Normal, Poisson, T

# normalize aliases like "gaussian" -> "Normal", "lognormal" -> "LogNormal"
dist = _normalize_dist_str(dist)

ngboost_dists = {
"Normal": Normal,
"Laplace": Laplace,
Expand Down Expand Up @@ -76,6 +81,8 @@ def _ngb_skpro_dist_params(
# Exponential | scale = 1/rate
# Normal, Laplace, TDistribution and Poisson have not yet
# been implemented for Survival analysis.
# normalize aliases so dict lookups below always use canonical names
dist = _normalize_dist_str(self.dist)

dist_params = {
"Normal": ["loc", "scale"],
Expand All @@ -95,14 +102,14 @@ def _ngb_skpro_dist_params(
"Exponential": ["rate"],
}

if self.dist in dist_params and self.dist in skpro_params:
ngboost_params = dist_params[self.dist]
skp_params = skpro_params[self.dist]
if dist in dist_params and dist in skpro_params:
ngboost_params = dist_params[dist]
skp_params = skpro_params[dist]
for ngboost_param, skp_param in zip(ngboost_params, skp_params):
kwargs[skp_param] = pred_dist.params[ngboost_param]
if self.dist == "LogNormal" and ngboost_param == "scale":
if dist == "LogNormal" and ngboost_param == "scale":
kwargs[skp_param] = np.log(pred_dist.params[ngboost_param])
if self.dist == "Exponential" and ngboost_param == "scale":
if dist == "Exponential" and ngboost_param == "scale":
kwargs[skp_param] = 1 / pred_dist.params[ngboost_param]

kwargs[skp_param] = self._check_y(y=kwargs[skp_param])
Expand Down Expand Up @@ -132,6 +139,9 @@ def _ngb_dist_to_skpro(self, **kwargs):
from skpro.distributions.poisson import Poisson
from skpro.distributions.t import TDistribution

# normalize aliases so dict lookup uses the canonical name
dist = _normalize_dist_str(self.dist)

ngboost_dists = {
"Normal": Normal,
"Laplace": Laplace,
Expand All @@ -143,7 +153,7 @@ def _ngb_dist_to_skpro(self, **kwargs):

skpro_dist = None

if self.dist in ngboost_dists:
skpro_dist = ngboost_dists[self.dist](**kwargs)
if dist in ngboost_dists:
skpro_dist = ngboost_dists[dist](**kwargs)

return skpro_dist
13 changes: 8 additions & 5 deletions skpro/regression/cyclic_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pandas as pd

from skpro.distributions.qpd import QPD_Johnson
from skpro.regression._dist_utils import _normalize_dist_str
from skpro.regression.base import BaseProbaRegressor


Expand Down Expand Up @@ -83,11 +84,11 @@ class CyclicBoosting(BaseProbaRegressor):
be on a bounded interval, with support between ``lower`` and ``upper``.
maximal_iterations : int, default=10
maximum number of iterations for the cyclic boosting algorithm
dist_type: str, one of ``'normal'`` (default), ``'logistic'``
dist_type: str, default ``'Normal'``
inner base distribution to use for the Johnson QPD, i.e., before
arcosh and similar transformations.
Available options are ``'normal'`` (default), ``'logistic'``,
or ``'sinhlogistic'``.
arcosh and similar transformations. Common aliases are accepted for backwards
compatibility. Available options: ``"Normal"``, ``"Logistic"``,
``"SinhLogistic"``.

Attributes
----------
Expand Down Expand Up @@ -308,14 +309,16 @@ def _predict_proba(self, X):
self.quantile_values.append(yhat)

# Johnson Quantile-Parameterized Distributions
# normalize alias to canonical name, then lowercase for QPD_Johnson
dist_type = _normalize_dist_str(self.dist_type).lower()
params = {
"alpha": self.alpha,
"qv_low": self.quantile_values[0].reshape(-1, 1),
"qv_median": self.quantile_values[1].reshape(-1, 1),
"qv_high": self.quantile_values[2].reshape(-1, 1),
"lower": self.lower,
"upper": self.upper,
"base_dist": self.dist_type,
"base_dist": dist_type,
"index": index,
"columns": y_cols,
}
Expand Down
16 changes: 6 additions & 10 deletions skpro/regression/ensemble/_ngboost.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Adapters to ngboost regressors with probabilistic components."""

# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)

__author__ = ["ShreeshaM07"]
Expand All @@ -18,16 +19,11 @@ class NGBoostRegressor(BaseProbaRegressor, NGBoostAdapter):
Parameters
----------
dist : string , default = "Normal"
distribution that must be used for
probabilistic prediction.
Available distribution types

1. "Normal"
2. "Laplace"
3. "LogNormal"
4. "Poisson"
5. "TDistribution"
6. "Exponential"
Distribution for probabilistic prediction. The canonical skpro class
name should be passed. Common aliases are accepted for backwards
compatibility. Available options: ``"Normal"``, ``"Laplace"``,
``"LogNormal"``, ``"Poisson"``, ``"TDistribution"``,
``"Exponential"``.

score : string , default = "LogScore"
A score from ngboost.scores for LogScore
Expand Down
33 changes: 11 additions & 22 deletions skpro/regression/gam/_gam.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from skpro.distributions.gamma import Gamma
from skpro.distributions.normal import Normal
from skpro.distributions.poisson import Poisson
from skpro.regression._dist_utils import _normalize_dist_str
from skpro.regression.base import BaseProbaRegressor


Expand All @@ -31,14 +32,10 @@ class GAMRegressor(BaseProbaRegressor):
Can be a ``pygam`` terms expression for custom model specification.

distribution : str or pygam.Distribution, optional (default='Normal')
Distribution family to use in the model.
Supported strings (case-insensitive):

* ``'Normal'`` or ``'Gaussian'`` - Normal/Gaussian distribution
* ``'Poisson'`` - Poisson distribution for count data
* ``'Gamma'`` - Gamma distribution for positive continuous data
* ``'Binomial'`` - Binomial distribution for binary/proportion data

Distribution family to use in the model. The canonical skpro class
name should be passed. Common aliases are accepted for backwards
compatibility. Available options: ``"Normal"``, ``"Poisson"``,
``"Gamma"``, ``"Binomial"``.
Alternatively, can pass a ``pygam.Distribution`` object directly.

link : str or pygam.Link, optional (default='identity')
Expand Down Expand Up @@ -159,20 +156,12 @@ def _fit(self, X, y):
callbacks = ["deviance", "diffs"]

dist_name = self._get_distribution_name(self.distribution)

# Map common names to skpro distribution names
dist_map = {
"normal": "normal",
"gaussian": "normal",
"poisson": "poisson",
"gamma": "gamma",
"binomial": "binomial",
"normaldist": "normal",
"poissondist": "poisson",
"gammadist": "gamma",
"binomialdist": "binomial",
}
dist_name = dist_map.get(dist_name, "normal")
# normalize to canonical skpro name, then lowercase for pygam
dist_name = _normalize_dist_str(dist_name).lower()
# pygam only supports these; fall back to "normal" if unrecognised
_pygam_supported = {"normal", "poisson", "gamma", "binomial"}
if dist_name not in _pygam_supported:
dist_name = "normal"

self._dist_name = dist_name

Expand Down
7 changes: 7 additions & 0 deletions skpro/regression/linear/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd

from skpro.regression._dist_utils import _normalize_dist_str
from skpro.regression.base import BaseProbaRegressor


Expand All @@ -25,6 +26,8 @@ class GLMRegressor(BaseProbaRegressor):
family : string, default : "Normal"
The family parameter denotes the type of distribution
that will be used.
The canonical skpro class name should be passed.
Common aliases are accepted for backwards compatibility.
Available family/distributions are
1."Normal"
2."Poisson"
Expand Down Expand Up @@ -214,6 +217,8 @@ def _str_to_sm_family(self, family, link):
from warnings import warn

from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson

family = _normalize_dist_str(family)
from statsmodels.genmod.families.links import Identity, InversePower, Log, Sqrt

sm_fmly = {
Expand Down Expand Up @@ -440,6 +445,8 @@ def _params_sm_to_skpro(self, y_predictions_df, index, columns, family):
from skpro.distributions.normal import Normal
from skpro.distributions.poisson import Poisson

family = _normalize_dist_str(family)

skpro_distr = {
"Normal": Normal,
"Poisson": Poisson,
Expand Down
Loading