Nixtla / mlforecast

Scalable machine 🤖 learning for time series forecasting.

Home Page: https://nixtlaverse.nixtla.io/mlforecast

[Distributed] Adding external variable for distributed

iamyihwa opened this issue · comments

What happened + What you expected to happen

fcst = DistributedMLForecast()
fcst.predict(h = 12, X_df = X_sf_test).collect()
-> TypeError: predict() got an unexpected keyword argument 'X_df'
fcst.predict(h = 12, new_df = X_sf_test).collect()
-> PythonException: An exception was thrown from a UDF: 'ValueError: The following columns are missing: ['y']'. Full traceback below:

Neither new_df (https://github.com/Nixtla/mlforecast/blob/main/mlforecast/distributed/forecast.py) nor X_df (https://nixtla.github.io/mlforecast/docs/how-to-guides/exogenous_features.html) seems to work for passing external variables to the distributed version of the forecast.

Versions / Dependencies

0.10.0

Reproduction script

# from mlforecast.distributed.models.spark.lgb import SparkLGBMForecast
# models = [SparkLGBMForecast()]
from mlforecast.distributed import DistributedMLForecast
from mlforecast.distributed.models.spark.xgb import SparkXGBForecast
from window_ops.expanding import expanding_mean  # assumed source of expanding_mean

models = [SparkXGBForecast()]  # SparkXGBRegressor and LightGBMRegressor were also tried
fcst = DistributedMLForecast(
    models,
    freq='D',
    lags=[1],
    lag_transforms={
        1: [expanding_mean],
    },
    date_features=['dayofweek'],
)
fcst.fit(
    Y_sf_train,  # Spark dataframe with the training series
    static_features=['x', 'y'],
)

fcst.predict(h=12, X_df=X_sf_test)

Issue Severity

None

Hey @iamyihwa, thanks for using mlforecast. The 0.10.0 version takes dynamic_dfs, which is a list of pandas dataframes; can you try using that?
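In code, the suggestion amounts to something like this sketch; the toPandas() conversion is an assumption here, since X_sf_test appears to be a Spark dataframe:

future_exog = X_sf_test.toPandas()  # the dynamic dfs must be pandas, not Spark
preds = fcst.predict(h=12, dynamic_dfs=[future_exog]).collect()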

@jmoralez Thanks for your suggestion.

However, I'm still getting an error after using dynamic_dfs.
Also, one question: which version does the documentation correspond to?

2023-11-06 17:02:14,311 INFO XGBoost-PySpark: _fit Running xgboost-2.0.1 on 1 workers with
booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'nthread': 1}
train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2023-11-06 17:02:25,233 INFO XGBoost-PySpark: _fit Finished xgboost training!
TypeError: cannot pickle '_thread.RLock' object

Details of the error are:
/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/mlforecast/utils.py in inner(*args, **kwargs)
162 new_args.append(kwargs.pop(arg_names[i]))
163 new_args.append(kwargs.pop(old_name))
--> 164 return f(*new_args, **kwargs)
165
166 return inner

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/mlforecast/distributed/forecast.py in predict(self, h, dynamic_dfs, before_predict_callback, after_predict_callback, new_df, horizon, new_data)
521 partition_results = self.partition_results
522 schema = self._get_predict_schema()
--> 523 res = fa.transform(
524 partition_results,
525 DistributedMLForecast._predict,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/fugue/workflow/api.py in transform(df, using, schema, params, partition, callback, ignore_errors, persist, as_local, save_path, checkpoint, engine, engine_conf, as_fugue)
140 else:
141 raise
--> 142 tdf = src.transform(
143 using=using,
144 schema=schema,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/fugue/workflow/workflow.py in transform(self, using, schema, params, pre_partition, ignore_errors, callback)
557 if pre_partition is None:
558 pre_partition = self.partition_spec
--> 559 df = self.workflow.transform(
560 self,
561 using=using,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/fugue/workflow/workflow.py in transform(self, using, schema, params, pre_partition, ignore_errors, callback, *dfs)
2036 tf._has_rpc_client = not isinstance(callback, EmptyRPCHandler) # type: ignore
2037 tf.validate_on_compile()
-> 2038 return self.process(
2039 *dfs,
2040 using=RunTransformer,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/fugue/workflow/workflow.py in process(self, using, schema, params, pre_partition, *dfs)
1698 """
1699 _dfs = self._to_dfs(*dfs)
-> 1700 task = Process(
1701 len(_dfs),
1702 processor=using,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/fugue/workflow/_tasks.py in __init__(self, input_n, processor, schema, params, pre_partition, deterministic, lazy, input_names)
255 ):
256 self._processor = _to_processor(processor, schema)
--> 257 self._processor._params = ParamDict(params)
258 self._processor._partition_spec = PartitionSpec(pre_partition)
259 self._processor.validate_on_compile()

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/triad/collections/dict.py in __init__(self, data, deep)
175 def __init__(self, data: Any = None, deep: bool = True):
176 super().__init__()
--> 177 self.update(data, deep=deep)
178
179 def __setitem__( # type: ignore

/local_disk0/.ephemeral_nfs/envs/pythonEnv-14049207-8c62-4864-93a7-4bc1fb414ecc/lib/python3.8/site-packages/triad/collections/dict.py in update(self, other, on_dup, deep)
262 for k, v in to_kv_iterable(other):
263 if on_dup == ParamDict.OVERWRITE or k not in self:
--> 264 self[k] = copy.deepcopy(v) if deep else v
265 elif on_dup == ParamDict.THROW:
266 raise KeyError(f"{k} exists in dict")

/usr/lib/python3.8/copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):

/usr/lib/python3.8/copy.py in _deepcopy_dict(x, memo, deepcopy)
228 memo[id(x)] = y
229 for key, value in x.items():
--> 230 y[deepcopy(key, memo)] = deepcopy(value, memo)
231 return y
232 d[dict] = _deepcopy_dict

/usr/lib/python3.8/copy.py in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
173
174 # If is its own copy, don't memoize.

/usr/lib/python3.8/copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
268 if state is not None:
269 if deep:
--> 270 state = deepcopy(state, memo)
271 if hasattr(y, '__setstate__'):
272 y.__setstate__(state)

[... the deepcopy → _reconstruct → _deepcopy_dict frames above repeat several more times while copying nested state ...]

/usr/lib/python3.8/copy.py in deepcopy(x, memo, _nil)
159 reductor = getattr(x, "__reduce_ex__", None)
160 if reductor is not None:
--> 161 rv = reductor(4)
162 else:
163 reductor = getattr(x, "__reduce__", None)

Can you provide the command you're running? Please note that the dynamic dfs should be pandas dataframes, not Spark.

Also, this possibility will be removed in 0.11.0, which will be released in the next couple of days and is the version the documentation corresponds to. Can you describe your use case? The distributed interface is meant for training only, with the assumption that once you've trained the model you can convert it to a local version and compute predictions with that. Would that work for you?
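As a minimal sketch of that workflow (the to_local() method name is an assumption about the conversion step; verify it against your installed version):

fcst.fit(Y_sf_train, static_features=['x', 'y'])  # train on the cluster
local_fcst = fcst.to_local()       # assumed conversion to a regular MLForecast
preds = local_fcst.predict(h=12)   # pandas dataframe with the forecasts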

I am using the command fcst.predict(h = 12, dynamic_dfs = X_df).collect()

Thanks for explaining what it will be in the future.
For my use case, it would be fine to pass in a pandas dataframe.
I am expecting much faster compute with the distributed Spark version.
(By the way, the non-Spark version also seems to be pretty fast, but I guess with a much larger dataset there will be a big difference?)

Yes, initially I passed the Spark dataframe, but now I've changed to pandas and am getting a different error.

from mlforecast.distributed import DistributedMLForecast
from mlforecast.distributed.models.spark.xgb import SparkXGBForecast
from window_ops.expanding import expanding_mean  # assumed source of expanding_mean

models = [SparkXGBForecast()]
fcst = DistributedMLForecast(
    models,
    freq='W-SAT',
    lags=[1],
    lag_transforms={
        1: [expanding_mean],
    },
    target_transforms=[Mean_Scaler()],  # Mean_Scaler is a user-defined transform
    date_features=['dayofweek'],
)
fcst.fit(
    Y_sf_train_cur_level,
    static_features=['embedding_x', 'embedding_y'],
)
fcst.predict(h=12, dynamic_dfs=X_df).collect()

X_df is a pandas dataframe.

predict() got an unexpected keyword argument 'dynamic_dfs'

2023-11-07 08:15:33,616 INFO XGBoost-PySpark: _fit Finished xgboost training!
TypeError: predict() got an unexpected keyword argument 'dynamic_dfs'

TypeError Traceback (most recent call last)
in
22 static_features=['embedding_x', 'embedding_y'],
23 )
---> 24 fcst.predict(h = 12, dynamic_dfs = X_df).collect()
25 ## Currently not possible to get the X variable to work for the spark version

TypeError: predict() got an unexpected keyword argument 'dynamic_dfs'

Are you still on 0.10.0? We released 0.11.0 yesterday and that removed the dynamic_dfs argument. Also, it should be a list, e.g. fcst.predict(h = 12, dynamic_dfs = [X_df]).collect().

About the speed: the models use multithreading when predicting, and we also have multithreading for the feature updates (if you set num_threads>1 in the forecast constructor), so if the data fits on one machine it's probably better to just use the regular interface. Also, 0.11.0 includes support for lag transformations implemented in C++, which should be way faster for the predict step (guide).
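In code, that local setup could look like the following sketch; the ExpandingMean import path and the choice of LinearRegression are illustrative assumptions, not taken from this thread:

from sklearn.linear_model import LinearRegression
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean  # assumed module path for the C++-backed transforms

fcst = MLForecast(
    models=[LinearRegression()],            # any scikit-learn style regressor
    freq='D',
    lags=[1],
    lag_transforms={1: [ExpandingMean()]},  # built-in, C++-backed expanding mean
    num_threads=4,                          # parallelizes the feature updates
)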

Please let us know if this works, sorry for the troubles.

@jmoralez Yes, I noticed the version has been upgraded!
Thanks for the great work!!

If dynamic_dfs has been removed, how can I add non-static external variables in the distributed version?

Sorry to mix another library into this post, but are external variables not supported in neuralforecast either?

It's not possible anymore because supporting distributed dataframes as X_df is very hard, and passing pandas seemed like overkill. But if that works for you, I think we could add the possibility of passing X_df as a pandas dataframe to the distributed version; it would then be broadcast to all workers and each would use its corresponding series.
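Purely as an illustration of that proposal (hypothetical at the time of writing, not an existing API):

# Hypothetical: X_df passed as a pandas dataframe, broadcast to every worker,
# each of which keeps the rows for the series it holds.
preds = fcst.predict(h=12, X_df=future_exog_pdf)  # future_exog_pdf: pandas dataframe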

Neuralforecast does support external variables (guide).

@jmoralez Sure, it would be great to have the option to add external variables to the distributed version!
What would it mean, though, to pass in a pandas dataframe? Would I need to add extra steps to make the distributed version of the model work?

Neuralforecast does support external variables (guide).

This link doesn't seem to work anymore... I saw lots of updates on the webpage.
Perhaps it's this one, in the newer version?
Cool that there are lots of different exogenous variable types one can add!

@jmoralez
We encountered a specific scenario where we actively utilized the 'dynamic_dfs' argument in MLForecast. Our use case involves building multi-regressor time-series models, specifically a recursive XGBoost model. Our data contains one primary y feature to predict, a date column, and two additional regressor columns.

In our approach, we apply lags and lag-transforms to both the primary y column and the additional regressor columns, along with various date features. However, we observed that MLForecast doesn't recursively predict the additional regressors. Consequently, we opted to predict them separately, utilized preprocessing to obtain lagged/transformed values of these newly predicted columns, and then incorporated them into the main forecast model using 'dynamic_dfs'. This ensures their use as time-based external regressors when predicting the primary y.

Given the removal of the 'dynamic_dfs' argument, are there alternative features or methods within MLForecast that could still support our specific use case? Any insights you can provide would be greatly appreciated.

Edit: Digging around the docstrings of the .fit and .predict functions, I've discovered that when fitting you can set static_features=[], and when predicting you can now provide an X_df. The docstring reads as if this can be used to provide future regressor values to the predict function. Is this now the correct way to go about this?

Hey @Markpajr. For the local interface, the dynamic_dfs argument was deprecated in favor of X_df, which takes a single dataframe with the ids, dates, and future values of the exogenous features. So if you were using a single dataframe, like dynamic_dfs=[df], you can now use X_df=df. Also, we recently incorporated a function to compute transformations on exogenous features; you may find this guide useful, since it seems to be what you're currently doing.
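A minimal sketch of that X_df workflow; train_df, future_df, and the regressor column names are illustrative:

# Fit with static_features=[] so the extra columns count as dynamic features,
# then supply their future values over the horizon at predict time.
fcst.fit(train_df, static_features=[])      # 'reg1', 'reg2' become dynamic
preds = fcst.predict(h=12, X_df=future_df)  # future_df: unique_id, ds, reg1, reg2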