[BUG] `Groupby` operations should fail on unsupported types instead of passing silently
galipremsagar opened this issue · comments
GALI PREM SAGAR commented
Describe the bug
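cudf `Groupby` aggregations pass silently when a requested aggregation is not supported for a column's type: in the example below, `mean` is simply omitted from the result for the string column `c` instead of raising an error.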
Steps/Code to reproduce bug
In [1]: import cudf
In [2]: gdf = cudf.DataFrame(
...: {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
...: )
In [3]: gdf.groupby("a").agg(["count", "mean"])
Out[3]:
      b          c
  count mean count
a
1     2  1.5     2
2     2  3.5     2
In [4]: gdf.to_pandas().groupby("a").agg(["count", "mean"])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1942, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
1941 try:
-> 1942 res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True)
1943 except Exception as err:
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/ops.py:864, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
862 preserve_dtype = True
--> 864 result = self._aggregate_series_pure_python(obj, func)
866 npvalues = lib.maybe_convert_objects(result, try_float=False)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/ops.py:885, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
884 for i, group in enumerate(splitter):
--> 885 res = func(group)
886 res = extract_result(res)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:2454, in GroupBy.mean.<locals>.<lambda>(x)
2451 else:
2452 result = self._cython_agg_general(
2453 "mean",
-> 2454 alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/series.py:6549, in Series.mean(self, axis, skipna, numeric_only, **kwargs)
6541 @doc(make_doc("mean", ndim=1))
6542 def mean(
6543 self,
(...)
6547 **kwargs,
6548 ):
-> 6549 return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/generic.py:12420, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
12413 def mean(
12414 self,
12415 axis: Axis | None = 0,
(...)
12418 **kwargs,
12419 ) -> Series | float:
> 12420 return self._stat_function(
12421 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
12422 )
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/generic.py:12377, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
12375 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 12377 return self._reduce(
12378 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
12379 )
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/series.py:6457, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
6453 raise TypeError(
6454 f"Series.{name} does not allow {kwd_name}={numeric_only} "
6455 "with non-numeric dtypes."
6456 )
-> 6457 return op(delegate, skipna=skipna, **kwds)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
146 else:
--> 147 result = alt(values, axis=axis, skipna=skipna, **kwds)
149 return result
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
402 mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
406 if datetimelike:
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:720, in nanmean(values, axis, skipna, mask)
719 the_sum = values.sum(axis, dtype=dtype_sum)
--> 720 the_sum = _ensure_numeric(the_sum)
722 if axis is not None and getattr(the_sum, "ndim", False):
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/nanops.py:1701, in _ensure_numeric(x)
1699 if isinstance(x, str):
1700 # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1701 raise TypeError(f"Could not convert string '{x}' to numeric")
1702 try:
TypeError: Could not convert string 'ab' to numeric
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
Cell In[4], line 1
----> 1 gdf.to_pandas().groupby("a").agg(["count", "mean"])
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:1432, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
1429 kwargs["engine_kwargs"] = engine_kwargs
1431 op = GroupByApply(self, func, args=args, kwargs=kwargs)
-> 1432 result = op.agg()
1433 if not is_dict_like(func) and result is not None:
1434 # GH #52849
1435 if not self.as_index and is_list_like(func):
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:193, in Apply.agg(self)
190 return self.agg_dict_like()
191 elif is_list_like(func):
192 # we require a list, but not a 'str'
--> 193 return self.agg_list_like()
195 if callable(func):
196 f = com.get_cython_func(func)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:326, in Apply.agg_list_like(self)
318 def agg_list_like(self) -> DataFrame | Series:
319 """
320 Compute aggregation in the case of a list-like argument.
321
(...)
324 Result of aggregation.
325 """
--> 326 return self.agg_or_apply_list_like(op_name="agg")
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:1571, in GroupByApply.agg_or_apply_list_like(self, op_name)
1566 # Only set as_index=True on groupby objects, not Window or Resample
1567 # that inherit from this class.
1568 with com.temp_setattr(
1569 obj, "as_index", True, condition=hasattr(obj, "as_index")
1570 ):
-> 1571 keys, results = self.compute_list_like(op_name, selected_obj, kwargs)
1572 result = self.wrap_results_list_like(keys, results)
1573 return result
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/apply.py:385, in Apply.compute_list_like(self, op_name, selected_obj, kwargs)
379 colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
380 args = (
381 [self.axis, *self.args]
382 if include_axis(op_name, colg)
383 else self.args
384 )
--> 385 new_res = getattr(colg, op_name)(func, *args, **kwargs)
386 results.append(new_res)
387 indices.append(index)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:257, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
255 kwargs["engine"] = engine
256 kwargs["engine_kwargs"] = engine_kwargs
--> 257 ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
258 if relabeling:
259 # columns is not narrowed by mypy from relabeling flag
260 assert columns is not None # for mypy
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:362, in SeriesGroupBy._aggregate_multiple_funcs(self, arg, *args, **kwargs)
360 for idx, (name, func) in enumerate(arg):
361 key = base.OutputKey(label=name, position=idx)
--> 362 results[key] = self.aggregate(func, *args, **kwargs)
364 if any(isinstance(x, DataFrame) for x in results.values()):
365 from pandas import concat
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/generic.py:249, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
247 if engine_kwargs is not None:
248 kwargs["engine_kwargs"] = engine_kwargs
--> 249 return getattr(self, func)(*args, **kwargs)
251 elif isinstance(func, abc.Iterable):
252 # Catch instances of lists / tuples
253 # but not the class list / tuple itself.
254 func = maybe_mangle_lambdas(func)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:2452, in GroupBy.mean(self, numeric_only, engine, engine_kwargs)
2445 return self._numba_agg_general(
2446 grouped_mean,
2447 executor.float_dtype_mapping,
2448 engine_kwargs,
2449 min_periods=0,
2450 )
2451 else:
-> 2452 result = self._cython_agg_general(
2453 "mean",
2454 alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1998, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
1996 return result
-> 1998 new_mgr = data.grouped_reduce(array_func)
1999 res = self._wrap_agged_manager(new_mgr)
2000 if how in ["idxmin", "idxmax"]:
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/internals/base.py:367, in SingleDataManager.grouped_reduce(self, func)
365 def grouped_reduce(self, func):
366 arr = self.array
--> 367 res = func(arr)
368 index = default_index(len(res))
370 mgr = type(self).from_array(res, index)
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1995, in GroupBy._cython_agg_general.<locals>.array_func(values)
1992 return result
1994 assert alt is not None
-> 1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
1996 return result
File /nvme/0/pgali/envs/cudfdev/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1946, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
1944 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
1945 # preserve the kind of exception that raised
-> 1946 raise type(err)(msg) from err
1948 if ser.dtype == object:
1949 res_values = res_values.astype(object, copy=False)
TypeError: agg function failed [how->mean,dtype->object]
Expected behavior
cudf should fail like pandas, raising a `TypeError` when an aggregation is not supported for a column's type.