ValueError: node array from the pickle has an incompatible dtype

ytzfhqs opened this issue · comments

I encountered an error when using SUOD on the Kaggle platform

The process of installing dependent packages is as follows:

!git clone https://github.com/yzhao062/pyod.git
%cd pyod
!pip install .
!pip install -r requirements_ci.txt

I encountered an error while using the SUOD code from the tutorial file:

from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
contamination = 0.1
n_train = 200
n_test = 100

# Generate sample data
X_train, X_test, y_train, y_test = \

# train SUOD
clf_name = 'SUOD'

# initialized a group of outlier detectors for acceleration
detector_list = [LOF(n_neighbors=15), LOF(n_neighbors=20),
                 LOF(n_neighbors=25), LOF(n_neighbors=35),
                 COPOD(), IForest(n_estimators=100),

# decide the number of parallel process, and the combination method
clf = SUOD(base_estimators=detector_list, n_jobs=2, combination='average',

# or to use the default detectors
# clf = SUOD(n_jobs=2, combination='average',
#            verbose=False)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)

Detailed error details:

ValueError                                Traceback (most recent call last)
Cell In[14], line 44
     41 y_train_scores = clf.decision_scores_  # raw outlier scores
     43 # get the prediction on the test data
---> 44 y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
     45 y_test_scores = clf.decision_function(X_test)  # outlier scores
     47 # evaluate and print the results

File /kaggle/working/pyod/pyod/models/base.py:168, in BaseDetector.predict(self, X, return_confidence)
    147 """Predict if a particular sample is an outlier or not.
    149 Parameters
    164     Only if return_confidence is set to True.
    165 """
    167 check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
--> 168 pred_score = self.decision_function(X)
    170 if isinstance(self.contamination, (float, int)):
    171     prediction = (pred_score > self.threshold_).astype('int').ravel()

File /kaggle/working/pyod/pyod/models/suod.py:260, in SUOD.decision_function(self, X)
    257 X = check_array(X)
    259 # initialize the output score
--> 260 predicted_scores = self.model_.decision_function(X)
    262 # standardize the score and combine
    263 predicted_scores = self.score_scalar_.transform(predicted_scores)

File /opt/conda/lib/python3.10/site-packages/suod/models/base.py:460, in SUOD.decision_function(self, X)
    456 # decide whether bps is needed
    457 # it is turned off
    458 if self.bps_flag:
    459     # load the pre-trained cost predictor to forecast the train cost
--> 460     cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
    462     time_cost_pred = cost_forecast_meta(cost_predictor, X,
    463                                         self.base_estimator_names)
    465     n_estimators_list, starts, n_jobs = balanced_scheduling(
    466         time_cost_pred, self.n_estimators, self.n_jobs, self.verbose)

File /opt/conda/lib/python3.10/site-packages/joblib/numpy_pickle.py:658, in load(filename, mmap_mode)
    652             if isinstance(fobj, str):
    653                 # if the returned file object is a string, this means we
    654                 # try to load a pickle file generated with an version of
    655                 # Joblib so we load it with joblib compatibility function.
    656                 return load_compatibility(fobj)
--> 658             obj = _unpickle(fobj, filename, mmap_mode)
    659 return obj

File /opt/conda/lib/python3.10/site-packages/joblib/numpy_pickle.py:577, in _unpickle(fobj, filename, mmap_mode)
    575 obj = None
    576 try:
--> 577     obj = unpickler.load()
    578     if unpickler.compat_mode:
    579         warnings.warn("The file '%s' has been generated with a "
    580                       "joblib version less than 0.10. "
    581                       "Please regenerate this pickle file."
    582                       % filename,
    583                       DeprecationWarning, stacklevel=3)

File /opt/conda/lib/python3.10/pickle.py:1213, in _Unpickler.load(self)
   1211             raise EOFError
   1212         assert isinstance(key, bytes_types)
-> 1213         dispatch[key[0]](self)
   1214 except _Stop as stopinst:
   1215     return stopinst.value

File /opt/conda/lib/python3.10/site-packages/joblib/numpy_pickle.py:402, in NumpyUnpickler.load_build(self)
    394 def load_build(self):
    395     """Called to set the state of a newly created object.
    397     We capture it to replace our place-holder objects, NDArrayWrapper or
    400     NDArrayWrapper is used for backward compatibility with joblib <= 0.9.
    401     """
--> 402     Unpickler.load_build(self)
    404     # For backward compatibility, we support NDArrayWrapper objects.
    405     if isinstance(self.stack[-1], (NDArrayWrapper, NumpyArrayWrapper)):

File /opt/conda/lib/python3.10/pickle.py:1718, in _Unpickler.load_build(self)
   1716 setstate = getattr(inst, "__setstate__", None)
   1717 if setstate is not None:
-> 1718     setstate(state)
   1719     return
   1720 slotstate = None

File sklearn/tree/_tree.pyx:676, in sklearn.tree._tree.Tree.__setstate__()

File sklearn/tree/_tree.pyx:1364, in sklearn.tree._tree._check_node_ndarray()

ValueError: node array from the pickle has an incompatible dtype:
- expected: [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]
- got     : {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}

The scikit-learn version is 1.2.2 and the joblib version is 1.2.0.
I ran other sample code without any issues, but whenever I use SUOD, I get the above error.

I had the same issue, first with scikit-learn version 1.1.3 and then again after I updated to 1.2.2 (just like you). Finally, I updated to 1.3.0.
Updating to 1.3.0 did the trick for me.

I hope that helps.