rapidsai / cudf

cuDF - GPU DataFrame Library

Home Page:https://docs.rapids.ai/api/cudf/stable/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

[BUG] `Groupby` operations should fail on un-supported types instead of passing silently

galipremsagar opened this issue · comments

Describe the bug
We seem to be passing Groupby operations when there is an unsupported operation for a type.

Steps/Code to reproduce bug

In [1]: import cudf

In [2]: gdf = cudf.DataFrame(
   ...:         {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
   ...:     )

In [3]: gdf.groupby("a").agg(["count", "mean"])
Out[3]: 
      b          c
  count mean count
a                 
1     2  1.5     2
2     2  3.5     2

In [4]: gdf.to_pandas().groupby("a").agg(["count", "mean"])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1942, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
   1941 try:
-> 1942     res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True)
   1943 except Exception as err:

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/ops.py:864, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    862     preserve_dtype = True
--> 864 result = self._aggregate_series_pure_python(obj, func)
    866 npvalues = lib.maybe_convert_objects(result, try_float=False)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/ops.py:885, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
    884 for i, group in enumerate(splitter):
--> 885     res = func(group)
    886     res = extract_result(res)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:2454, in GroupBy.mean.<locals>.<lambda>(x)
   2451 else:
   2452     result = self._cython_agg_general(
   2453         "mean",
-> 2454         alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
   2455         numeric_only=numeric_only,
   2456     )
   2457     return result.__finalize__(self.obj, method="groupby")

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/series.py:6549, in Series.mean(self, axis, skipna, numeric_only, **kwargs)
   6541 @doc(make_doc("mean", ndim=1))
   6542 def mean(
   6543     self,
   (...)
   6547     **kwargs,
   6548 ):
-> 6549     return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/generic.py:12420, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
  12413 def mean(
  12414     self,
  12415     axis: Axis | None = 0,
   (...)
  12418     **kwargs,
  12419 ) -> Series | float:
> 12420     return self._stat_function(
  12421         "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
  12422     )

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/generic.py:12377, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  12375 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 12377 return self._reduce(
  12378     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  12379 )

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/series.py:6457, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   6453     raise TypeError(
   6454         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   6455         "with non-numeric dtypes."
   6456     )
-> 6457 return op(delegate, skipna=skipna, **kwds)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
    146 else:
--> 147     result = alt(values, axis=axis, skipna=skipna, **kwds)
    149 return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
    402     mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
    406 if datetimelike:

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:720, in nanmean(values, axis, skipna, mask)
    719 the_sum = values.sum(axis, dtype=dtype_sum)
--> 720 the_sum = _ensure_numeric(the_sum)
    722 if axis is not None and getattr(the_sum, "ndim", False):

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:1701, in _ensure_numeric(x)
   1699 if isinstance(x, str):
   1700     # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1701     raise TypeError(f"Could not convert string '{x}' to numeric")
   1702 try:

TypeError: Could not convert string 'ab' to numeric

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 gdf.to_pandas().groupby("a").agg(["count", "mean"])

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:1432, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
   1429     kwargs["engine_kwargs"] = engine_kwargs
   1431 op = GroupByApply(self, func, args=args, kwargs=kwargs)
-> 1432 result = op.agg()
   1433 if not is_dict_like(func) and result is not None:
   1434     # GH #52849
   1435     if not self.as_index and is_list_like(func):

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:193, in Apply.agg(self)
    190     return self.agg_dict_like()
    191 elif is_list_like(func):
    192     # we require a list, but not a 'str'
--> 193     return self.agg_list_like()
    195 if callable(func):
    196     f = com.get_cython_func(func)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:326, in Apply.agg_list_like(self)
    318 def agg_list_like(self) -> DataFrame | Series:
    319     """
    320     Compute aggregation in the case of a list-like argument.
    321 
   (...)
    324     Result of aggregation.
    325     """
--> 326     return self.agg_or_apply_list_like(op_name="agg")

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:1571, in GroupByApply.agg_or_apply_list_like(self, op_name)
   1566 # Only set as_index=True on groupby objects, not Window or Resample
   1567 # that inherit from this class.
   1568 with com.temp_setattr(
   1569     obj, "as_index", True, condition=hasattr(obj, "as_index")
   1570 ):
-> 1571     keys, results = self.compute_list_like(op_name, selected_obj, kwargs)
   1572 result = self.wrap_results_list_like(keys, results)
   1573 return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:385, in Apply.compute_list_like(self, op_name, selected_obj, kwargs)
    379 colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
    380 args = (
    381     [self.axis, *self.args]
    382     if include_axis(op_name, colg)
    383     else self.args
    384 )
--> 385 new_res = getattr(colg, op_name)(func, *args, **kwargs)
    386 results.append(new_res)
    387 indices.append(index)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:257, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    255 kwargs["engine"] = engine
    256 kwargs["engine_kwargs"] = engine_kwargs
--> 257 ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
    258 if relabeling:
    259     # columns is not narrowed by mypy from relabeling flag
    260     assert columns is not None  # for mypy

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:362, in SeriesGroupBy._aggregate_multiple_funcs(self, arg, *args, **kwargs)
    360     for idx, (name, func) in enumerate(arg):
    361         key = base.OutputKey(label=name, position=idx)
--> 362         results[key] = self.aggregate(func, *args, **kwargs)
    364 if any(isinstance(x, DataFrame) for x in results.values()):
    365     from pandas import concat

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:249, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    247     if engine_kwargs is not None:
    248         kwargs["engine_kwargs"] = engine_kwargs
--> 249     return getattr(self, func)(*args, **kwargs)
    251 elif isinstance(func, abc.Iterable):
    252     # Catch instances of lists / tuples
    253     # but not the class list / tuple itself.
    254     func = maybe_mangle_lambdas(func)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:2452, in GroupBy.mean(self, numeric_only, engine, engine_kwargs)
   2445     return self._numba_agg_general(
   2446         grouped_mean,
   2447         executor.float_dtype_mapping,
   2448         engine_kwargs,
   2449         min_periods=0,
   2450     )
   2451 else:
-> 2452     result = self._cython_agg_general(
   2453         "mean",
   2454         alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
   2455         numeric_only=numeric_only,
   2456     )
   2457     return result.__finalize__(self.obj, method="groupby")

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1998, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
   1995     result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
   1996     return result
-> 1998 new_mgr = data.grouped_reduce(array_func)
   1999 res = self._wrap_agged_manager(new_mgr)
   2000 if how in ["idxmin", "idxmax"]:

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/internals/base.py:367, in SingleDataManager.grouped_reduce(self, func)
    365 def grouped_reduce(self, func):
    366     arr = self.array
--> 367     res = func(arr)
    368     index = default_index(len(res))
    370     mgr = type(self).from_array(res, index)

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1995, in GroupBy._cython_agg_general.<locals>.array_func(values)
   1992     return result
   1994 assert alt is not None
-> 1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
   1996 return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1946, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
   1944     msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
   1945     # preserve the kind of exception that raised
-> 1946     raise type(err)(msg) from err
   1948 if ser.dtype == object:
   1949     res_values = res_values.astype(object, copy=False)

TypeError: agg function failed [how->mean,dtype->object]

Expected behavior
cudf should fail like pandas.