Description
-
I have checked that this issue has not already been reported.
-
I have confirmed this bug exists on the latest version of pandas.
-
(optional) I have confirmed this bug exists on the master branch of pandas.
Note: Please read this guide detailing how to provide the necessary information for us to reproduce your bug.
Code Sample, a copy-pastable example
df = pd.DataFrame(
[
(1, "a", "f"),
(2, "b", "d"),
(3, None, "f"),
(4, "a", None),
(5, "a", "f"),
(6, None, "d"),
(7, None, "d"),
(8, "b", None),
(9, "a", "f"),
(10, None, None),
(11, None, "f"),
(12, None, "d"),
],
columns=["item", "att1", "att2"],
)
df2 = pd.DataFrame(
[
(1, "a", "f"),
(2, "b", "d"),
(3, None, "f"),]
, columns=["item", "att1", "att2"]
)
def count_not_null(series: pd.Series) -> int:
return series.notnull().astype(int).sum()
def count_all(series: pd.Series) -> int:
""" count all the values (regardless if they are null or nan) """
return len(series)
df.agg("count")
# item 12
# att1 6
# att2 9
# dtype: int64
df.agg(["count", ])
# item att1 att2
# count 12 6 9
df.agg(count_all)
# item 12
# att1 12
# att2 12
# dtype: int64
df.agg([count_all,])
# item att1 att2
# count_all 12 12 12
df.query("item==1").agg(["count",]) # works fine
# item att1 att2
# count 1 1 1
# All seems to work well however same aggregation gives weird results on the other dataframe
df2.aggregate([count_all,])
# item att1 att2
# item att1 count_all
# count_all 3.0 3.0 NaN
# 0 NaN NaN 1.0
# 1 NaN NaN 1.0
# 2 NaN NaN 1.0
# I get the same issues on the other df when I filter some column, for example
df.query("item==1").agg([count_all,]) # **weird result** ->
# item att1 att2
# item count_all count_all
# count_all 1.0 NaN NaN
# 0 NaN 1.0 1.0
# And it fails if I use more than one custom aggregations
df.query("item==1").agg(["sum", "count"]) # works with standard aggregations
# item att1 att2
# sum 1 a f
# count 1 1 1
# fails with custom aggregations
df.query("item==1").agg([count_all, count_not_null])
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _axis)
553 result = Series(results, index=keys, name=self.name)
554 if is_nested_object(result):
--> 555 raise ValueError("cannot combine transform and aggregation operations")
556 return result
557
ValueError: cannot combine transform and aggregation operations
# df[df["item"]==1].agg([count_all, count_not_null]) also fails
# It works well if I don't use any filtering via query
df.agg([count_all, count_not_null])
# item att1 att2
# count_all 12 12 12
# count_not_null 12 6 9
# It also fails without filtering on the second dataframe
df2.agg([count_all, count_not_null])
---> 1 df2.agg([count_all, count_not_null])
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
6704 result = None
6705 try:
-> 6706 result, how = self._aggregate(func, axis=axis, *args, **kwargs)
6707 except TypeError:
6708 pass
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/frame.py in _aggregate(self, arg, axis, *args, **kwargs)
6718 result = result.T if result is not None else result
6719 return result, how
-> 6720 return super()._aggregate(arg, *args, **kwargs)
6721
6722 agg = aggregate
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
475 elif is_list_like(arg):
476 # we require a list, but not an 'str'
--> 477 return self._aggregate_multiple_funcs(arg, _axis=_axis), None
478 else:
479 result = None
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _axis)
521 colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
522 try:
--> 523 new_res = colg.aggregate(arg)
524 except (TypeError, DataError):
525 pass
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/series.py in aggregate(self, func, axis, *args, **kwargs)
3686 # Validate the axis parameter
3687 self._get_axis_number(axis)
-> 3688 result, how = self._aggregate(func, *args, **kwargs)
3689 if result is None:
3690
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
475 elif is_list_like(arg):
476 # we require a list, but not an 'str'
--> 477 return self._aggregate_multiple_funcs(arg, _axis=_axis), None
478 else:
479 result = None
/usr/local/anaconda3/envs/hooqu/lib/python3.8/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _axis)
553 result = Series(results, index=keys, name=self.name)
554 if is_nested_object(result):
--> 555 raise ValueError("cannot combine transform and aggregation operations")
556 return result
557
Problem description
I would have expected the output of a custom aggregation upon filtering to be very similar to that of the standard ones. Furthermore, there seems to be a small bug when passing a single custom aggregation inside a collection to the agg
method of DataFrame.
I have narrowed down the problem to the call to _aggregate_multiple_funcs,
which works differently based on the size of the dataframe and the number of functions.
In particular, when executing the aggregation on the columns (series), different columns behave differently. Example:
if I define:
def count_all(series):
print(type(series))
return len(series)
And then call aggregate
on different columns, the function is passed different sets of parameters:
df2['att1'].agg(count_all)
# <class 'str'>
# <class 'str'>
# <class 'NoneType'>
# <class 'pandas.core.series.Series'>
# and
df2['att2'].agg(count_all)
# <class 'str'>
# <class 'str'>
# <class 'str'>
I don't understand this behaviour. I would expect that both functions receive only the full series data.
Workaround
After reading the code I found this line (not sure if it has to do with the problem):
https://github.com/pandas-dev/pandas/blob/v1.0.3/pandas/core/series.py#L3706
So I reimplemented one of the custom aggregation like this:
def count_all(series):
if not isinstance(series, pd.Series):
raise TypeError
return len(series)
And then all aggregations worked. Was I using a bad implementation of a custom aggregation function? From the docs it was not obvious that the aggregation function is required to check the input type.
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.8.1.final.0
python-bits : 64
OS : Darwin
OS-release : 19.0.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : de_DE.UTF-8
LOCALE : de_DE.UTF-8
pandas : 1.0.3
numpy : 1.18.1
pytz : 2019.3
dateutil : 2.8.1
pip : 20.0.2
setuptools : 45.2.0.post20200210
Cython : None
pytest : 5.4.1
hypothesis : 5.8.0
sphinx : 2.4.4
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.1
IPython : 7.12.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pytest : 5.4.1
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None
numba : None