fabsig / GPBoost

Combining tree-boosting with Gaussian process and mixed effects models

Predictions underfit

VHolstein opened this issue

I'm currently using your package inside another Python package (photonai) to build a model pipeline, so that a number of different models can be run through the same pipeline. I have integrated your GPBoostRegressor as shown below. For testing purposes I'm training the model on a simple dataset where each row of X is a vector of numbers and y is the sum of those numbers. I was expecting a mean absolute error of around 0, but this is not the case. Any suggestions as to what I might be doing wrong when integrating your model like this?

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GroupKFold, KFold
from photonai.base import Hyperpipe, PipelineElement
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import gpboost as gpb


class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.needs_covariates = True
        # TODO: Sklearn example: https://github.com/fabsig/GPBoost/blob/master/examples/python-guide/sklearn_example.py
        # TODO: python guide: https://github.com/fabsig/GPBoost/blob/master/examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py
        # self.gpmodel = gpb.GPModel(likelihood="gaussian")
        self.gpboost = gpb.GPBoostRegressor()

    def fit(self, X, y, **kwargs):
        if "cluster" and "Z" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
            self.gpboost.fit(X, y, gp_model=gpmodel)
        else:
            raise NotImplementedError("GPBoost needs clusters")
        return self

    def predict(self, X, **kwargs):
        clst = pd.Series(kwargs["clusters"])
        preds = self.gpboost.predict(X, group_data_pred=clst)
        preds = preds["response_mean"]
        return preds

    def save(self):
        return None


def get_gpboost_pipe(pipe_name, project_folder, split="group"):

    if split == "group":
        outercv = GroupKFold(n_splits=10)
    else:
        outercv = KFold(n_splits=10)

    my_pipe = Hyperpipe(pipe_name,
                        optimizer='grid_search',
                        metrics=['mean_absolute_error', 'mean_squared_error',
                                 'spearman_correlation', 'pearson_correlation'],
                        best_config_metric='mean_absolute_error',
                        outer_cv=outercv,
                        inner_cv=KFold(n_splits=10),
                        calculate_metrics_across_folds=True,
                        use_test_set=True,
                        verbosity=1,
                        project_folder=project_folder)

    # Add transformer elements
    my_pipe += PipelineElement("StandardScaler", hyperparameters={},
                               test_disabled=True, with_mean=True, with_std=True)

    my_pipe += PipelineElement.create("GPBoost", GPBoostDataWrapper(), hyperparameters={})

    return my_pipe

Thanks for using GPBoost!

Can you please provide a reproducible example?

Yes, sure! Sorry for not providing this earlier. I made a mock example, very similar to the data I was using to debug my code. The model produces a mean absolute error quite similar to the Dummy Estimator (which just guesses the mean for every sample). In theory the model should be able to estimate the target accurately, as it is just the sum of all the values in a row. I'm sure I'm making an error in the way I use your code, I just have not been able to figure it out.

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GroupKFold, KFold
from photonai.base import Hyperpipe, PipelineElement
import numpy as np
import pandas as pd
import gpboost as gpb


class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.needs_covariates = True
        # self.gpmodel = gpb.GPModel(likelihood="gaussian")
        self.gpboost = gpb.GPBoostRegressor()

    def fit(self, X, y, **kwargs):
        if "clusters" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
            self.gpboost.fit(X, y, gp_model=gpmodel)
        else:
            raise NotImplementedError("GPBoost needs clusters")
        return self

    def predict(self, X, **kwargs):
        clst = pd.Series(kwargs["clusters"])
        preds = self.gpboost.predict(X, group_data_pred=clst)
        preds = preds["response_mean"]
        return preds

    def save(self):
        return None


def get_gpboost_pipe(pipe_name, project_folder, split="group"):

    if split == "group":
        outercv = GroupKFold(n_splits=10)
    else:
        outercv = KFold(n_splits=10)

    my_pipe = Hyperpipe(pipe_name,
                        optimizer='grid_search',
                        metrics=['mean_absolute_error', 'mean_squared_error',
                                 'spearman_correlation', 'pearson_correlation'],
                        best_config_metric='mean_absolute_error',
                        outer_cv=outercv,
                        inner_cv=KFold(n_splits=10),
                        calculate_metrics_across_folds=True,
                        use_test_set=True,
                        verbosity=1,
                        project_folder=project_folder)

    # Add transformer elements
    my_pipe += PipelineElement("StandardScaler", hyperparameters={},
                               test_disabled=True, with_mean=True, with_std=True)

    my_pipe += PipelineElement.create("GPBoost", GPBoostDataWrapper(), hyperparameters={})

    return my_pipe


def get_mock_data():

    X = np.random.randint(10, size=(200, 9))
    y = np.sum(X, axis=1)
    clst = np.random.randint(10, size=200)

    return X, y, clst


X, y, clst = get_mock_data()

# define project folder
project_folder = "/tmp/gpboost_debug"

my_pipe = get_gpboost_pipe("Test_gpboost", project_folder, split="random")
my_pipe.fit(X, y, clusters=clst)

I get an error AttributeError: 'Hyperpipe' object has no attribute 'best_config_metric' when trying to run your code (photonai version 2.4.0).

In addition, can you please also provide code that shows you calculate your mean absolute error and compare it to a "dummy estimator".

I just updated to photonai version 2.4.0, and the code above still works for me. Previously I was running it with version 2.3.0. I also checked the source code for the Hyperpipe, and it does seem to have a best_config_metric attribute (https://github.com/wwu-mmll/photonai/blob/main/photonai/base/hyperpipe.py; line 282). Maybe testing it with version 2.3.0 will work better?

The photonai package calculates the mean absolute error and runs the dummy estimator automatically; both are handled by the Hyperpipe. You select your metrics and best_config_metric: if these are regression metrics, the dummy estimator guesses the mean; if they are classification metrics, it guesses the most common value.
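
For reference, here is a minimal sketch of what that baseline computes outside of photonai (this uses sklearn's DummyRegressor and the get_mock_data helper from above; photonai computes this internally, so this is only an illustration):

from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error

# draw train/test sets from the same mock distribution
X_train, y_train, _ = get_mock_data()
X_test, y_test, _ = get_mock_data()

# the regression dummy always predicts the training mean
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
print(mean_absolute_error(y_test, dummy.predict(X_test)))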

I am still getting the same error with photonai version 2.3.0.

Traceback (most recent call last):

  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\formatters.py", line 345, in __call__
    return method()

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 625, in _repr_html_inner
    return estimator_html_repr(self)

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 417, in estimator_html_repr
    _write_estimator_html(

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 151, in _write_estimator_html
    est_block = _get_visual_block(estimator)

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\_estimator_html_repr.py", line 127, in _get_visual_block
    for key, est in estimator.get_params(deep=False).items()

  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 211, in get_params
    value = getattr(self, key)

AttributeError: 'Hyperpipe' object has no attribute 'best_config_metric'

Out[1]: Hyperpipe(name='Test_gpboost')

Dear fabsig,
the error you are experiencing occurs because you are apparently using a Jupyter notebook (or IPython console).
The fit method returns self, i.e. a Hyperpipe object, and the notebook then tries to render it, which is what triggers the AttributeError in sklearn's HTML repr.
Please change the last line of the code to:
_ = my_pipe.fit(X, y, clusters=clst)
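
Equivalently, a trailing semicolon suppresses the displayed output in IPython/Jupyter:

my_pipe.fit(X, y, clusters=clst);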

Dear fabsig, I have adapted the code to test it outside of the photonai package, and it still shows the same problem. Maybe I'm using the GPBoost algorithm the wrong way?

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
import pandas as pd
import gpboost as gpb


class GPBoostDataWrapper(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.needs_covariates = True
        # self.gpmodel = gpb.GPModel(likelihood="gaussian")
        self.gpboost = gpb.GPBoostRegressor()

    def fit(self, X, y, **kwargs):
        if "clusters" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            gpmodel = gpb.GPModel(likelihood="gaussian", group_data=clst)
            self.gpboost.fit(X, y, gp_model=gpmodel)
        else:
            raise NotImplementedError("GPBoost needs clusters")
        return self

    def predict(self, X, **kwargs):
        if "clusters" in kwargs:
            clst = pd.Series(kwargs["clusters"])
            preds = self.gpboost.predict(X, group_data_pred=clst)
            preds = preds["response_mean"]
            return preds
        else:
            raise NotImplementedError("GPBoost needs clusters")

    def save(self):
        return None


def get_mock_data():

    X = np.random.randint(10, size=(200, 9))
    y = np.sum(X, axis=1)
    clst = np.random.randint(10, size=200)

    return X, y, clst


X_train, y_train, clst_train = get_mock_data()
X_test, y_test, clst_test = get_mock_data()

model = GPBoostDataWrapper()

model.fit(X_train, y_train, clusters=clst_train)

y_pred = model.predict(X_test, clusters=clst_test)

print(np.sum(np.absolute(y_test - y_pred)))  # total absolute error over the test set

I've also added an even simpler test case that illustrates the same problem.

import numpy as np
import pandas as pd
import gpboost as gpb


def get_mock_data():

    X = np.random.randint(10, size=(200, 9))
    y = np.sum(X, axis=1)
    clst = np.random.randint(10, size=200)

    return X, y, clst

# get train/test data
X_train, y_train, clst_train = get_mock_data()
X_test, y_test, clst_test = get_mock_data()

gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
bst = gpb.GPBoostRegressor()
bst.fit(X_train, y_train, gp_model=gp_model)

preds = bst.predict(X_test, group_data_pred=clst_test)
print(np.sum(np.absolute(y_test - preds["response_mean"])) / len(y_test))  # mean absolute error

Thanks for the example.

The reason is that you do not choose tuning parameters but use default values. You must always choose tuning parameters appropriately for every data set when applying boosting; there are no universal "default values". The script below shows how this can be done in your example; I obtain a much smaller MAE. Note that in this example you simulate from a linear model with a sample size of only n=200. It is thus expected that a linear model performs better than a tree-boosting model (the difference will become smaller for larger sample sizes when the true model is linear).

# Choose tuning parameters
param_grid = {'learning_rate': [1, 0.1, 0.01],
              'min_data_in_leaf': [10, 100, 1000],
              'max_depth': [1, 2, 3, 5, 10],
              'lambda_l2': [0, 1, 10]}
other_params = {'num_leaves': 2**10, 'verbose': 0}
gp_model = gpb.GPModel(group_data=clst_train, likelihood="gaussian")
data_train = gpb.Dataset(data=X_train, label=y_train)
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
                                             num_try_random=None, nfold=4, seed=1000,
                                             train_set=data_train, gp_model=gp_model,
                                             use_gp_model_for_validation=True, verbose_eval=1,
                                             num_boost_round=1000, early_stopping_rounds=10,
                                             metric = "mae") 
# Best parameters:                                   
# ***** New best test score (l1 = 0.9031159556358567) found for the following parameter combination:
# {'learning_rate': 1.0, 'min_data_in_leaf': 10, 'max_depth': 1, 'lambda_l2': 1, 'num_boost_round': 642}

# Training and prediction
bst = gpb.GPBoostRegressor(n_estimators=642, learning_rate=1, num_leaves=2**10,
                           max_depth=1, min_data_in_leaf=10, reg_lambda=1)
bst.fit(X_train, y_train, gp_model=gp_model)

preds = bst.predict(X_test, group_data_pred=clst_test)
print(np.sum(np.absolute(y_test - preds["response_mean"])))  # total absolute error
# 189.18370198237892
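
To illustrate the last point: since y is an exact linear function of X in this mock data, a plain linear regression recovers it almost perfectly. A quick sketch with sklearn (for comparison only, not part of the GPBoost answer above):

from sklearn.linear_model import LinearRegression

# y is the exact sum of the columns of X, so OLS can fit it (nearly) perfectly
lin = LinearRegression().fit(X_train, y_train)
print(np.sum(np.absolute(y_test - lin.predict(X_test))))
# close to 0, up to numerical error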