diff --git a/skpro/regression/linear/__init__.py b/skpro/regression/linear/__init__.py index bc7d9a41d..8dced82fc 100644 --- a/skpro/regression/linear/__init__.py +++ b/skpro/regression/linear/__init__.py @@ -5,11 +5,13 @@ from skpro.regression.linear._glm import GLMRegressor from skpro.regression.linear._glum import GlumRegressor from skpro.regression.linear._sklearn import ARDRegression, BayesianRidge +from skpro.regression.linear._sklearn_gamma import GammaRegressor from skpro.regression.linear._sklearn_poisson import PoissonRegressor __all__ = [ "ARDRegression", "BayesianRidge", + "GammaRegressor", "GLMRegressor", "GlumRegressor", "PoissonRegressor", diff --git a/skpro/regression/linear/_sklearn_gamma.py b/skpro/regression/linear/_sklearn_gamma.py new file mode 100644 index 000000000..071af75d0 --- /dev/null +++ b/skpro/regression/linear/_sklearn_gamma.py @@ -0,0 +1,241 @@ +"""Adapters to sklearn linear regressors with probabilistic components.""" + +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +# based on sktime pipelines + +import numpy as np +import pandas as pd + +from skpro.regression.base import BaseProbaRegressor +from skpro.utils.sklearn import prep_skl_df + + +class GammaRegressor(BaseProbaRegressor): + """Gamma regression, direct adapter to sklearn GammaRegressor. + + Generalized Linear Model with a Gamma distribution. + This regressor uses the 'log' link function. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty term. Defaults to 1.0. + See the notes for the exact mathematical meaning of this + parameter. alpha = 0 is equivalent to unpenalized GLMs. + + fit_intercept : bool, default=True + Whether to fit an intercept term. + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Tolerance for the stopping criteria of the optimization solver. + + verbose : int, default=0 + For the 'sag' and 'lbfgs' solvers set verbose to any positive + number for verbosity. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + Coefficients of the regression model (mean of distribution) + + intercept_ : float + Independent term in decision function. + + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + + n_features_in_ : int + Number of features seen during :term:'fit'. + + feature_names_in_ : ndarray of shape (n_features,) + Names of features seen during :term:'fit'. + """ + + _tags = { + "capability:multioutput": False, + "capability:missing": False, + "X_inner_mtype": "pd_DataFrame_Table", + "y_inner_mtype": "pd_DataFrame_Table", + } + + def __init__( + self, + alpha=1.0, + fit_intercept=True, + max_iter=100, + tol=1e-4, + verbose=0, + warm_start=False, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + + super().__init__() + + from sklearn.linear_model import GammaRegressor as _GammaRegressor + + skl_estimator = _GammaRegressor( + alpha=alpha, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + warm_start=warm_start, + ) + + self.estimator_ = skl_estimator + + FITTED_PARAMS_TO_FORWARD = [ + "coef_", + "intercept_", + "n_iter_", + ] + + def _fit(self, X, y): + """Fit regressor to training data. + + Writes to self: + Sets fitted model attributes ending in "_". + + Parameters + ---------- + X : pandas DataFrame + feature instances to fit regressor to + y : pandas DataFrame, must be same length as X + labels to fit regressor to + + Returns + ------- + self : reference to self + """ + X_inner = prep_skl_df(X).to_numpy() + y_inner = prep_skl_df(y).to_numpy() + + self._y_cols = y.columns + + if len(y_inner.shape) > 1 and y_inner.shape[1] == 1: + y_inner = y_inner[:, 0] + + estimator = self.estimator_ + estimator.fit(X=X_inner, y=y_inner) + + for attr in self.FITTED_PARAMS_TO_FORWARD: + setattr(self, attr, getattr(estimator, attr)) + + n_samples, n_features = X_inner.shape + p = n_features + (1 if self.fit_intercept else 0) + + y_pred = estimator.predict(X_inner) + # Guard against zero/underflow predictions when used as a divisor + eps = np.finfo(float).eps + y_pred = np.clip(y_pred, eps, None) + + # Estimate dispersion (phi) using Pearson chi-squared statistic + if n_samples > p: + dispersion = np.sum(((y_inner - y_pred) / y_pred) ** 2) / (n_samples - p) + # Ensure strictly positive dispersion to avoid divide-by-zero later + self.dispersion_ = max(dispersion, eps) + else: + self.dispersion_ = 1.0 + + return self + + def _predict(self, X): + """Predict labels for data from features. + + State required: + Requires state to be "fitted" = self.is_fitted=True + + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for + + Returns + ------- + y : pandas DataFrame, same length as `X`, same columns as `y` in `fit` + labels predicted for `X` + """ + X_inner = prep_skl_df(X).to_numpy() + y_pred = self.estimator_.predict(X_inner) + y_pred_df = pd.DataFrame(y_pred, index=X.index, columns=self._y_cols) + return y_pred_df + + def _predict_var(self, X): + """Compute/return variance predictions.""" + y_pred = self._predict(X) + return self.dispersion_ * (y_pred**2) + + def _predict_proba(self, X): + """Predict distribution over labels for data from features. + + State required: + Requires state to be "fitted". + + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for + + Returns + ------- + y_pred : skpro BaseDistribution, same length as `X` + labels predicted for `X` + """ + from skpro.distributions.gamma import Gamma + + y_cols = self._y_cols + y_pred = self.predict(X).values + + # Gamma distribution parameters: shape (alpha) and rate (beta) + alpha = 1.0 / self.dispersion_ + beta = 1.0 / (self.dispersion_ * y_pred) + + y_pred_proba = Gamma(alpha=alpha, beta=beta, index=X.index, columns=y_cols) + return y_pred_proba + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + `create_test_instance` uses the first (or only) dictionary in `params` + """ + param1 = {} + param2 = { + "alpha": 2.0, + "fit_intercept": False, + "max_iter": 200, + "tol": 2e-4, + "verbose": 1, + "warm_start": True, + } + return [param1, param2]