diff --git a/.all-contributorsrc b/.all-contributorsrc index 9821a70c4..ed004b90e 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -264,6 +264,12 @@ "bug", "doc" ] + }, + { + "login": "paramsureliya", + "name": "Param Sureliya", + "profile": "https://github.com/paramsureliya", + "contributions": ["code"] } ] } diff --git a/skpro/regression/_dist_utils.py b/skpro/regression/_dist_utils.py new file mode 100644 index 000000000..981f28c91 --- /dev/null +++ b/skpro/regression/_dist_utils.py @@ -0,0 +1,130 @@ +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +"""Distribution string normalisation utility for skpro regressors. + +Provides ``_normalize_dist_str``, which maps all known string aliases for a +probability distribution to the canonical capitalized class name used in skpro +(e.g. ``"gaussian"`` -> ``"Normal"``, ``"t"`` -> ``"TDistribution"``). + +Every probabilistic regressor adapter should call this before its own internal +string -> object mapping so that users can pass any reasonable alias and have it +work uniformly across regressors and in GridSearchCV / RandomizedSearchCV. +""" + +_DIST_ALIAS_MAP: dict[str, str] = { + # Normal / Gaussian + "normal": "Normal", + "gaussian": "Normal", + "norm": "Normal", + # Laplace + "laplace": "Laplace", + "double_exponential": "Laplace", + # LogNormal + "lognormal": "LogNormal", + "log_normal": "LogNormal", + "log-normal": "LogNormal", + "log normal": "LogNormal", + # TDistribution + "tdistribution": "TDistribution", + "t_distribution": "TDistribution", + "t-distribution": "TDistribution", + "t": "TDistribution", + "student_t": "TDistribution", + "studentt": "TDistribution", + "student-t": "TDistribution", + # Poisson + "poisson": "Poisson", + # Exponential + "exponential": "Exponential", + "exp": "Exponential", + # Gamma + "gamma": "Gamma", + # Beta + "beta": "Beta", + # Weibull + "weibull": "Weibull", + # Cauchy + "cauchy": "Cauchy", + # Binomial + "binomial": "Binomial", + "binom": "Binomial", + # NegativeBinomial + "negativebinomial": "NegativeBinomial", + "negative_binomial": "NegativeBinomial", + "negative.binomial": "NegativeBinomial", + "negbinomial": "NegativeBinomial", + "negbin": "NegativeBinomial", + "neg_binomial": "NegativeBinomial", + # InverseGaussian + "inversegaussian": "InverseGaussian", + "inverse_gaussian": "InverseGaussian", + "inverse.gaussian": "InverseGaussian", + "inv_gaussian": "InverseGaussian", + # Tweedie + "tweedie": "Tweedie", + # Logistic / SinhLogistic (QPD inner distributions used by CyclicBoosting) + "logistic": "Logistic", + "sinhlogistic": "SinhLogistic", + "sinh_logistic": "SinhLogistic", + "sinh-logistic": "SinhLogistic", +} + + +def _normalize_dist_str(dist: str) -> str: + """Normalize a distribution string to the canonical capitalized class name. + + Maps every known alias (case-insensitive) to the capitalized skpro class + name, e.g.:: + + _normalize_dist_str("gaussian") -> "Normal" + _normalize_dist_str("t") -> "TDistribution" + _normalize_dist_str("lognormal") -> "LogNormal" + _normalize_dist_str("Normal") -> "Normal" # already canonical + + Non-string inputs (e.g. a distribution class or object) are returned + unchanged so callers do not need to guard separately. + + Unknown strings emit a ``UserWarning`` and are returned as-is to preserve + backward-compatibility with any existing library-specific aliases. + + Parameters + ---------- + dist : str + Distribution name in any accepted format. + + Returns + ------- + str + Canonical distribution name (capitalised class name in skpro). + + Examples + -------- + >>> from skpro.regression._dist_utils import _normalize_dist_str + >>> _normalize_dist_str("gaussian") + 'Normal' + >>> _normalize_dist_str("lognormal") + 'LogNormal' + >>> _normalize_dist_str("t") + 'TDistribution' + >>> _normalize_dist_str("Normal") + 'Normal' + """ + if not isinstance(dist, str): + return dist + + lower = dist.lower() + + # 1. Direct alias lookup (handles the vast majority of cases) + if lower in _DIST_ALIAS_MAP: + return _DIST_ALIAS_MAP[lower] + + # 2. Unknown — warn but do not raise (preserves backward-compatibility) + import warnings + + warnings.warn( + f"Distribution string '{dist}' is not recognised by _normalize_dist_str " + f"and will be passed through unchanged. If this is intentional, consider " + f"adding it to _DIST_ALIAS_MAP in skpro/regression/_dist_utils.py.", + UserWarning, + stacklevel=2, + ) + return dist diff --git a/skpro/regression/adapters/ngboost/_ngboost_proba.py b/skpro/regression/adapters/ngboost/_ngboost_proba.py index 2d620b095..dbf2f3a79 100644 --- a/skpro/regression/adapters/ngboost/_ngboost_proba.py +++ b/skpro/regression/adapters/ngboost/_ngboost_proba.py @@ -3,6 +3,8 @@ __author__ = ["ShreeshaM07"] +from skpro.regression._dist_utils import _normalize_dist_str + class NGBoostAdapter: """Adapter to interconvert NGBoost and skpro BaseDistributions. @@ -34,6 +36,9 @@ def _dist_to_ngboost_instance(self, dist, survival=False): """ from ngboost.distns import Exponential, Laplace, LogNormal, Normal, Poisson, T + # normalize aliases like "gaussian" -> "Normal", "lognormal" -> "LogNormal" + dist = _normalize_dist_str(dist) + ngboost_dists = { "Normal": Normal, "Laplace": Laplace, @@ -76,6 +81,8 @@ def _ngb_skpro_dist_params( # Exponential | scale = 1/rate # Normal, Laplace, TDistribution and Poisson have not yet # been implemented for Survival analysis. + # normalize aliases so dict lookups below always use canonical names + dist = _normalize_dist_str(self.dist) dist_params = { "Normal": ["loc", "scale"], @@ -95,14 +102,14 @@ def _ngb_skpro_dist_params( "Exponential": ["rate"], } - if self.dist in dist_params and self.dist in skpro_params: - ngboost_params = dist_params[self.dist] - skp_params = skpro_params[self.dist] + if dist in dist_params and dist in skpro_params: + ngboost_params = dist_params[dist] + skp_params = skpro_params[dist] for ngboost_param, skp_param in zip(ngboost_params, skp_params): kwargs[skp_param] = pred_dist.params[ngboost_param] - if self.dist == "LogNormal" and ngboost_param == "scale": + if dist == "LogNormal" and ngboost_param == "scale": kwargs[skp_param] = np.log(pred_dist.params[ngboost_param]) - if self.dist == "Exponential" and ngboost_param == "scale": + if dist == "Exponential" and ngboost_param == "scale": kwargs[skp_param] = 1 / pred_dist.params[ngboost_param] kwargs[skp_param] = self._check_y(y=kwargs[skp_param]) @@ -132,6 +139,9 @@ def _ngb_dist_to_skpro(self, **kwargs): from skpro.distributions.poisson import Poisson from skpro.distributions.t import TDistribution + # normalize aliases so dict lookup uses the canonical name + dist = _normalize_dist_str(self.dist) + ngboost_dists = { "Normal": Normal, "Laplace": Laplace, @@ -143,7 +153,7 @@ def _ngb_dist_to_skpro(self, **kwargs): skpro_dist = None - if self.dist in ngboost_dists: - skpro_dist = ngboost_dists[self.dist](**kwargs) + if dist in ngboost_dists: + skpro_dist = ngboost_dists[dist](**kwargs) return skpro_dist diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index e536f6e47..c208d5551 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -20,6 +20,7 @@ import pandas as pd from skpro.distributions.qpd import QPD_Johnson +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor @@ -83,11 +84,11 @@ class CyclicBoosting(BaseProbaRegressor): be on a bounded interval, with support between ``lower`` and ``upper``. maximal_iterations : int, default=10 maximum number of iterations for the cyclic boosting algorithm - dist_type: str, one of ``'normal'`` (default), ``'logistic'`` + dist_type: str, default ``'Normal'`` inner base distribution to use for the Johnson QPD, i.e., before - arcosh and similar transformations. - Available options are ``'normal'`` (default), ``'logistic'``, - or ``'sinhlogistic'``. + arcosh and similar transformations. Common aliases are accepted for backwards + compatibility. Available options: ``"Normal"``, ``"Logistic"``, + ``"SinhLogistic"``. Attributes ---------- @@ -308,6 +309,8 @@ def _predict_proba(self, X): self.quantile_values.append(yhat) # Johnson Quantile-Parameterized Distributions + # normalize alias to canonical name, then lowercase for QPD_Johnson + dist_type = _normalize_dist_str(self.dist_type).lower() params = { "alpha": self.alpha, "qv_low": self.quantile_values[0].reshape(-1, 1), @@ -315,7 +318,7 @@ def _predict_proba(self, X): "qv_high": self.quantile_values[2].reshape(-1, 1), "lower": self.lower, "upper": self.upper, - "base_dist": self.dist_type, + "base_dist": dist_type, "index": index, "columns": y_cols, } diff --git a/skpro/regression/ensemble/_ngboost.py b/skpro/regression/ensemble/_ngboost.py index 94b286913..581fc9362 100644 --- a/skpro/regression/ensemble/_ngboost.py +++ b/skpro/regression/ensemble/_ngboost.py @@ -1,4 +1,5 @@ """Adapters to ngboost regressors with probabilistic components.""" + # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) __author__ = ["ShreeshaM07"] @@ -18,16 +19,11 @@ class NGBoostRegressor(BaseProbaRegressor, NGBoostAdapter): Parameters ---------- dist : string , default = "Normal" - distribution that must be used for - probabilistic prediction. - Available distribution types - - 1. "Normal" - 2. "Laplace" - 3. "LogNormal" - 4. "Poisson" - 5. "TDistribution" - 6. "Exponential" + Distribution for probabilistic prediction. The canonical skpro class + name should be passed. Common aliases are accepted for backwards + compatibility. Available options: ``"Normal"``, ``"Laplace"``, + ``"LogNormal"``, ``"Poisson"``, ``"TDistribution"``, + ``"Exponential"``. score : string , default = "LogScore" A score from ngboost.scores for LogScore diff --git a/skpro/regression/gam/_gam.py b/skpro/regression/gam/_gam.py index 21a80003d..753aa72f2 100644 --- a/skpro/regression/gam/_gam.py +++ b/skpro/regression/gam/_gam.py @@ -11,6 +11,7 @@ from skpro.distributions.gamma import Gamma from skpro.distributions.normal import Normal from skpro.distributions.poisson import Poisson +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor @@ -31,14 +32,10 @@ class GAMRegressor(BaseProbaRegressor): Can be a ``pygam`` terms expression for custom model specification. distribution : str or pygam.Distribution, optional (default='Normal') - Distribution family to use in the model. - Supported strings (case-insensitive): - - * ``'Normal'`` or ``'Gaussian'`` - Normal/Gaussian distribution - * ``'Poisson'`` - Poisson distribution for count data - * ``'Gamma'`` - Gamma distribution for positive continuous data - * ``'Binomial'`` - Binomial distribution for binary/proportion data - + Distribution family to use in the model. The canonical skpro class + name should be passed. Common aliases are accepted for backwards + compatibility. Available options: ``"Normal"``, ``"Poisson"``, + ``"Gamma"``, ``"Binomial"``. Alternatively, can pass a ``pygam.Distribution`` object directly. link : str or pygam.Link, optional (default='identity') @@ -159,20 +156,12 @@ def _fit(self, X, y): callbacks = ["deviance", "diffs"] dist_name = self._get_distribution_name(self.distribution) - - # Map common names to skpro distribution names - dist_map = { - "normal": "normal", - "gaussian": "normal", - "poisson": "poisson", - "gamma": "gamma", - "binomial": "binomial", - "normaldist": "normal", - "poissondist": "poisson", - "gammadist": "gamma", - "binomialdist": "binomial", - } - dist_name = dist_map.get(dist_name, "normal") + # normalize to canonical skpro name, then lowercase for pygam + dist_name = _normalize_dist_str(dist_name).lower() + # pygam only supports these; fall back to "normal" if unrecognised + _pygam_supported = {"normal", "poisson", "gamma", "binomial"} + if dist_name not in _pygam_supported: + dist_name = "normal" self._dist_name = dist_name diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index bb9e65f87..3979377d1 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor @@ -25,6 +26,8 @@ class GLMRegressor(BaseProbaRegressor): family : string, default : "Normal" The family parameter denotes the type of distribution that will be used. + The canonical skpro class name should be passed. + Common aliases are accepted for backwards compatibility. Available family/distributions are 1."Normal" 2."Poisson" @@ -214,6 +217,8 @@ def _str_to_sm_family(self, family, link): from warnings import warn from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson + + family = _normalize_dist_str(family) from statsmodels.genmod.families.links import Identity, InversePower, Log, Sqrt sm_fmly = { @@ -440,6 +445,8 @@ def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): from skpro.distributions.normal import Normal from skpro.distributions.poisson import Poisson + family = _normalize_dist_str(family) + skpro_distr = { "Normal": Normal, "Poisson": Poisson, diff --git a/skpro/regression/linear/_glum.py b/skpro/regression/linear/_glum.py index 77be264aa..89db5f170 100644 --- a/skpro/regression/linear/_glum.py +++ b/skpro/regression/linear/_glum.py @@ -11,6 +11,7 @@ from skpro.distributions.negative_binomial import NegativeBinomial from skpro.distributions.normal import Normal from skpro.distributions.poisson import Poisson +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor @@ -24,10 +25,11 @@ class GlumRegressor(BaseProbaRegressor): Parameters ---------- - family : str or ExponentialDispersionModel, default='normal' - The distributional assumption of the GLM. - One of: 'binomial', 'gamma', 'gaussian', 'inverse.gaussian', - 'normal', 'poisson', 'tweedie', 'negative.binomial'. + family : str or ExponentialDispersionModel, default='Normal' + The distributional assumption of the GLM. The canonical skpro class + name should be passed. Common aliases are accepted for backwards + compatibility. Available options: ``"Normal"``, ``"Poisson"``, + ``"Gamma"``, ``"NegativeBinomial"``. link : str or Link, default='auto' The link function of the GLM. If 'auto', the canonical link for the family is used. @@ -133,7 +135,7 @@ class GlumRegressor(BaseProbaRegressor): def __init__( self, - family="normal", + family="Normal", link="auto", alpha=None, l1_ratio=0, @@ -196,11 +198,11 @@ def __init__( @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator.""" - params1 = {"family": "normal"} - params2 = {"family": "gamma", "link": "log"} - params3 = {"family": "poisson"} - params4 = {"family": "negative.binomial"} - params5 = {"family": "normal", "alpha": 0.1, "l1_ratio": 0.5} + params1 = {"family": "Normal"} + params2 = {"family": "Gamma", "link": "log"} + params3 = {"family": "Poisson"} + params4 = {"family": "NegativeBinomial"} + params5 = {"family": "Normal", "alpha": 0.1, "l1_ratio": 0.5} return [params1, params2, params3, params4, params5] def _fit(self, X, y): @@ -312,30 +314,34 @@ def _predict_proba(self, X): The predicted distribution. """ mu = self._predict(X) - family = self.family - if isinstance(family, str): - family_str = family.lower() - else: - # If family is an object, we need to infer the type - # This is tricky, but let's assume string for now as per init - family_str = str(family).lower() + # handle glum's 'negative.binomial(theta)' string format before normalizing + family_raw = self.family + theta_from_str = None + if isinstance(family_raw, str) and "(" in family_raw: + try: + theta_from_str = float(family_raw.split("(")[1].split(")")[0]) + family_raw = family_raw.split("(")[0].strip() + except ValueError: + pass + + family = _normalize_dist_str(family_raw) - if "normal" in family_str or "gaussian" in family_str: + if family == "Normal": # Normal distribution # Variance = dispersion * v(mu) = dispersion * 1 = dispersion # So sigma = sqrt(dispersion) sigma = np.sqrt(self.dispersion_) return Normal(mu=mu, sigma=sigma, index=X.index, columns=self._y_cols) - elif "poisson" in family_str: + elif family == "Poisson": # Poisson distribution # skpro Poisson takes mu. # If dispersion != 1, it's not standard Poisson. # But skpro Poisson is standard. return Poisson(mu=mu, index=X.index, columns=self._y_cols) - elif "gamma" in family_str: + elif family == "Gamma": # Gamma distribution # mu = alpha / beta # var = alpha / beta^2 = dispersion * mu^2 @@ -345,24 +351,17 @@ def _predict_proba(self, X): beta = 1.0 / (self.dispersion_ * mu) return Gamma(alpha=alpha, beta=beta, index=X.index, columns=self._y_cols) - elif "negative.binomial" in family_str: + elif family == "NegativeBinomial": # Negative Binomial # var = mu + theta * mu^2 # skpro NB takes mu and alpha (where var = mu + mu^2/alpha) # So alpha_skpro = 1/theta_glum - # We need to extract theta from family string or object - # If family is string like 'negative.binomial(1.5)', theta is 1.5 - # If family is 'negative.binomial', theta is default 1.0? - theta = 1.0 - if "(" in family_str: - try: - theta = float(family_str.split("(")[1].split(")")[0]) - except ValueError: - pass - - # Also check if family_instance has theta + # theta extracted from string like 'negative.binomial(1.5)' + if theta_from_str is not None: + theta = theta_from_str + # family_instance theta takes precedence if available if hasattr(self.estimator_.family_instance, "theta"): theta = self.estimator_.family_instance.theta diff --git a/skpro/regression/ondil.py b/skpro/regression/ondil.py index 261f85a75..5364623d8 100644 --- a/skpro/regression/ondil.py +++ b/skpro/regression/ondil.py @@ -15,6 +15,7 @@ ``skpro.distributions`` object; otherwise an informative error is raised. """ +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor @@ -188,7 +189,7 @@ def _predict_proba(self, X): raise TypeError("Unrecognized predict output from ondil: %s" % e) # decide mapping based on requested distribution - dist = self.distribution + dist = _normalize_dist_str(self.distribution) # import skpro distributions lazily distr_mod = importlib.import_module("skpro.distributions") diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 54e09b4a7..3e1900f74 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -7,6 +7,7 @@ import pandas as pd from sklearn import clone +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor from skpro.utils.numpy import flatten_to_1D_if_colvector from skpro.utils.sklearn import prep_skl_df @@ -77,7 +78,9 @@ class ResidualDouble(BaseProbaRegressor): distr_type : str or BaseDistribution, default = "Normal" type of distribution to predict - str options are "Normal", "Laplace", "Cauchy", "t" + str options are ``"Normal"``, ``"Laplace"``, ``"Cauchy"``, + ``"TDistribution"``. Common aliases are accepted for backwards + compatibility. distr_loc_scale_name : tuple of length two, default = ("loc", "scale") names of the parameters in the distribution to use for location and scale @@ -298,7 +301,7 @@ def _predict_proba(self, X): est = self.estimator_ est_r = self.estimator_resid_ use_y_pred = self.use_y_pred - distr_type = self.distr_type + distr_type = _normalize_dist_str(self.distr_type) distr_loc_scale_name = self.distr_loc_scale_name distr_params = self.distr_params min_scale = self.min_scale @@ -343,7 +346,7 @@ def _predict_proba(self, X): distr_type = Laplace distr_loc_scale_name = ("mu", "scale") - elif distr_type in ["Cauchy", "t"]: + elif distr_type in ["Cauchy", "TDistribution"]: from skpro.distributions.t import TDistribution distr_type = TDistribution @@ -402,7 +405,7 @@ def get_test_params(cls, parameter_set="default"): "estimator_resid": RandomForestRegressor(), "min_scale": 1e-6, "use_y_pred": True, - "distr_type": "t", + "distr_type": "TDistribution", "distr_params": {"df": 3}, "cv": KFold(n_splits=3), } diff --git a/skpro/regression/tests/test_dist_utils.py b/skpro/regression/tests/test_dist_utils.py new file mode 100644 index 000000000..09bf3ebd5 --- /dev/null +++ b/skpro/regression/tests/test_dist_utils.py @@ -0,0 +1,260 @@ +"""Tests for _normalize_dist_str and its integration into NGBoostRegressor. + +These tests cover: +- Unit tests for ``_normalize_dist_str`` (no external dependencies needed) +- Cross-regressor alias consistency checks +- Integration tests for NGBoostRegressor (skipped if ngboost not installed) +""" + +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["paramsureliya"] + +import warnings + +import pytest + +# --------------------------------------------------------------------------- +# Unit tests for _normalize_dist_str +# --------------------------------------------------------------------------- + + +class TestNormalizeDistStr: + """Unit tests for the standalone _normalize_dist_str utility.""" + + @pytest.fixture(autouse=True) + def _import(self): + from skpro.regression._dist_utils import _normalize_dist_str + + self.fn = _normalize_dist_str + + @pytest.mark.parametrize( + "canonical", + [ + "Normal", + "Laplace", + "LogNormal", + "TDistribution", + "Poisson", + "Exponential", + "Gamma", + "Beta", + "Weibull", + "Cauchy", + "Binomial", + "NegativeBinomial", + "InverseGaussian", + "Tweedie", + ], + ) + def test_canonical_passthrough(self, canonical): + """Already-canonical names must be returned as-is.""" + assert self.fn(canonical) == canonical + + @pytest.mark.parametrize( + "alias", + ["normal", "gaussian", "Gaussian", "GAUSSIAN", "norm", "NORMAL"], + ) + def test_normal_aliases(self, alias): + """All common spellings of Normal/Gaussian map to 'Normal'.""" + assert self.fn(alias) == "Normal" + + @pytest.mark.parametrize( + "alias, expected", + [ + # Laplace + ("laplace", "Laplace"), + ("double_exponential", "Laplace"), + # LogNormal + ("lognormal", "LogNormal"), + ("log_normal", "LogNormal"), + ("log-normal", "LogNormal"), + ("log normal", "LogNormal"), + # TDistribution — plain "t" is used by ResidualDouble + ("t", "TDistribution"), + ("tdistribution", "TDistribution"), + ("t_distribution", "TDistribution"), + ("student_t", "TDistribution"), + ("studentt", "TDistribution"), + # Poisson + ("poisson", "Poisson"), + # Exponential + ("exponential", "Exponential"), + ("exp", "Exponential"), + # Gamma + ("gamma", "Gamma"), + # Beta + ("beta", "Beta"), + # Weibull + ("weibull", "Weibull"), + # Cauchy + ("cauchy", "Cauchy"), + # Binomial + ("binomial", "Binomial"), + ("binom", "Binomial"), + # NegativeBinomial — GlumRegressor uses dot-notation + ("negative.binomial", "NegativeBinomial"), + ("negative_binomial", "NegativeBinomial"), + ("negativebinomial", "NegativeBinomial"), + ("negbin", "NegativeBinomial"), + # InverseGaussian — GlumRegressor uses dot-notation + ("inverse.gaussian", "InverseGaussian"), + ("inverse_gaussian", "InverseGaussian"), + ("inversegaussian", "InverseGaussian"), + # Tweedie + ("tweedie", "Tweedie"), + ], + ) + def test_alias_mapping(self, alias, expected): + assert ( + self.fn(alias) == expected + ), f"_normalize_dist_str({alias!r}) should return {expected!r}" + + def test_non_string_passthrough(self): + """Non-string inputs (classes, objects, None) must pass through unchanged.""" + + class FakeDist: + pass + + obj = FakeDist() + assert self.fn(obj) is obj + assert self.fn(None) is None + assert self.fn(42) == 42 + + def test_unknown_string_warns_not_raises(self): + """An unrecognised string emits UserWarning but does not raise.""" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = self.fn("totally_unknown_dist_xyz") + assert result == "totally_unknown_dist_xyz" + assert len(w) == 1 + assert issubclass(w[0].category, UserWarning) + assert "totally_unknown_dist_xyz" in str(w[0].message) + + @pytest.mark.parametrize("alias", ["gaussian", "lognormal", "t", "Normal"]) + def test_idempotent(self, alias): + """Calling the function twice must give the same result as once.""" + once = self.fn(alias) + twice = self.fn(once) + assert once == twice, ( + f"_normalize_dist_str is not idempotent for {alias!r}: " + f"first call -> {once!r}, second call -> {twice!r}" + ) + + +# --------------------------------------------------------------------------- +# Cross-regressor alias consistency (no external deps) +# --------------------------------------------------------------------------- + + +class TestCrossRegressorAliasConsistency: + """The same alias must resolve to the same canonical name regardless of + which regressor calls _normalize_dist_str — this is the invariant that + makes GridSearchCV across regressors work. + """ + + def test_gaussian_always_means_normal(self): + from skpro.regression._dist_utils import _normalize_dist_str + + for alias in ("gaussian", "Gaussian", "GAUSSIAN", "normal", "Normal"): + assert ( + _normalize_dist_str(alias) == "Normal" + ), f"Expected 'Normal' for alias {alias!r}" + + def test_t_alias_consistent(self): + """'t' (used by ResidualDouble) must map to 'TDistribution'.""" + from skpro.regression._dist_utils import _normalize_dist_str + + for alias in ("t", "TDistribution", "tdistribution", "t_distribution"): + assert ( + _normalize_dist_str(alias) == "TDistribution" + ), f"Expected 'TDistribution' for alias {alias!r}" + + def test_glum_dot_notation_normalized(self): + """GlumRegressor uses dot-notation for two distributions.""" + from skpro.regression._dist_utils import _normalize_dist_str + + assert _normalize_dist_str("negative.binomial") == "NegativeBinomial" + assert _normalize_dist_str("inverse.gaussian") == "InverseGaussian" + + +# --------------------------------------------------------------------------- +# Integration tests — NGBoostRegressor accepts aliases end-to-end +# (skipped automatically if ngboost is not installed) +# --------------------------------------------------------------------------- + + +class TestNGBoostRegressorAliases: + """Verify that NGBoostRegressor accepts distribution string aliases. + + Skipped automatically if ngboost is not importable. To run locally:: + + pip install ngboost + pytest skpro/regression/tests/test_dist_utils.py -v + """ + + @pytest.fixture(autouse=True) + def _require_ngboost(self): + pytest.importorskip("ngboost", reason="ngboost not installed") + + @pytest.fixture + def xy(self): + from sklearn.datasets import load_diabetes + from sklearn.model_selection import train_test_split + + X, y = load_diabetes(return_X_y=True, as_frame=True) + X_tr, X_te, y_tr, _ = train_test_split(X, y, test_size=0.1, random_state=0) + return X_tr, X_te, y_tr + + @pytest.mark.parametrize( + "alias", + [ + "Normal", # canonical — must still work + "normal", # lowercase + "gaussian", # common synonym + "Gaussian", # mixed-case synonym + "Laplace", + "laplace", + "Poisson", + "poisson", + "LogNormal", + "lognormal", + "log_normal", + ], + ) + def test_fit_predict_with_alias(self, alias, xy): + """NGBoostRegressor(dist=alias) must fit and predict_proba without error.""" + from skpro.regression.ensemble import NGBoostRegressor + + X_tr, X_te, y_tr = xy + est = NGBoostRegressor(dist=alias, n_estimators=20, verbose=False) + est.fit(X_tr, y_tr) + y_pred = est.predict_proba(X_te) + assert y_pred is not None + + @pytest.mark.parametrize( + "alias_a, alias_b", + [ + ("Normal", "gaussian"), + ("Normal", "normal"), + ("LogNormal", "lognormal"), + ("Laplace", "laplace"), + ], + ) + def test_aliases_produce_same_distribution_type(self, alias_a, alias_b, xy): + """Two aliases for the same distribution must return the same output type.""" + from skpro.regression.ensemble import NGBoostRegressor + + X_tr, X_te, y_tr = xy + est_a = NGBoostRegressor(dist=alias_a, n_estimators=20, verbose=False) + est_b = NGBoostRegressor(dist=alias_b, n_estimators=20, verbose=False) + est_a.fit(X_tr, y_tr) + est_b.fit(X_tr, y_tr) + + pred_a = est_a.predict_proba(X_te) + pred_b = est_b.predict_proba(X_te) + + assert type(pred_a) is type(pred_b), ( + f"dist={alias_a!r} -> {type(pred_a).__name__}, " + f"dist={alias_b!r} -> {type(pred_b).__name__} — expected the same type" + ) diff --git a/skpro/regression/xgboostlss.py b/skpro/regression/xgboostlss.py index 4ff62772e..6df29c92b 100644 --- a/skpro/regression/xgboostlss.py +++ b/skpro/regression/xgboostlss.py @@ -4,6 +4,7 @@ from skbase.utils.dependencies import _check_soft_dependencies +from skpro.regression._dist_utils import _normalize_dist_str from skpro.regression.base import BaseProbaRegressor @@ -15,17 +16,12 @@ class XGBoostLSS(BaseProbaRegressor): Parameters ---------- dist: str, optional, default="Normal" - Form of predictive distribution, strings are same as in skpro. + Form of predictive distribution. The canonical name of the skpro + distribution class should be passed, e.g., ``"Normal"``, ``"Gamma"``. + Common aliases are accepted for backwards compatibility. - Valid options are: - - * "Normal": Normal distribution. - * "Gamma": Gamma distribution. - * "Laplace": Laplace distribution. - * "LogNormal": LogNormal distribution. - * "TDistribution": Student's T distribution. - * "Weibull": Weibull distribution. - * "Beta": Beta distribution. + Valid options are: ``"Normal"``, ``"Gamma"``, ``"Laplace"``, + ``"LogNormal"``, ``"TDistribution"``, ``"Weibull"``, ``"Beta"``. stabilization: str, optional, default="None" Stabilization method for the Gradient and Hessian. @@ -277,6 +273,8 @@ def _get_xgblss_distr(self, distr): """ import importlib + distr = _normalize_dist_str(distr) + SKPRO_TO_XGBLSS = { "Normal": "Gaussian", "TDistribution": "StudentT", @@ -300,6 +298,7 @@ def _get_skpro_distr(self, distr): """ import importlib + distr = _normalize_dist_str(distr) module_str = "skpro.distributions" object_str = distr @@ -318,6 +317,7 @@ def _get_skpro_val_dict(self, distr, df): df : pd.DataFrame DataFrame of parameters as returned by predict, in xgboostlss. """ + distr = _normalize_dist_str(distr) name_map = { "Normal": {"mu": "loc", "sigma": "scale"}, "Gamma": {"alpha": "concentration", "beta": "rate"}, diff --git a/skpro/survival/ensemble/_ngboost_surv.py b/skpro/survival/ensemble/_ngboost_surv.py index 31a89628e..0e8fd66bc 100644 --- a/skpro/survival/ensemble/_ngboost_surv.py +++ b/skpro/survival/ensemble/_ngboost_surv.py @@ -1,4 +1,5 @@ """class for NGBoost probabilistic survival regression.""" + # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) __author__ = ["ShreeshaM07"] @@ -21,12 +22,9 @@ class NGBoostSurvival(BaseSurvReg, NGBoostAdapter): Parameters ---------- dist : string , default = "LogNormal" - assumed distributional form of Y|X=x. - A distribution from ngboost.distns, e.g. LogNormal - Available distribution types: - - 1. "LogNormal" - 2. "Exponential" + assumed distributional form of Y|X=x. The canonical skpro class + name should be passed. Common aliases are accepted for backwards + compatibility. Available options: ``"LogNormal"``, ``"Exponential"``. score : string , default = "LogScore" rule to compare probabilistic predictions P̂ to the observed data y.