This _is_ frowned-upon behaviour (storing arrays inside DataFrames), but is there a reason for this raise?
Deleting the raising lines seems only to break the tests that check that they raise...
df = pd.DataFrame([[1, np.array([10, 20, 30])],
                   [1, np.array([40, 50, 60])],
                   [2, np.array([20, 30, 40])]],
                  columns=['category', 'arraydata'])
g = df.groupby('category')
g.agg(sum)
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-34-527a2010b455> in <module>()
----> 1 g.agg(sum)
/Users/andy/pandas/pandas/core/groupby.py in agg(self, func, *args, **kwargs)
337 @Appender(_agg_doc)
338 def agg(self, func, *args, **kwargs):
--> 339 return self.aggregate(func, *args, **kwargs)
340
341 def _iterate_slices(self):
/Users/andy/pandas/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
1740 cyfunc = _intercept_cython(arg)
1741 if cyfunc and not args and not kwargs:
-> 1742 return getattr(self, cyfunc)()
1743
1744 if self.grouper.nkeys > 1:
/Users/andy/pandas/pandas/core/groupby.py in f(self)
62 raise SpecificationError(str(e))
63 except Exception:
---> 64 result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
65 if _convert:
66 result = result.convert_objects()
/Users/andy/pandas/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
1745 return self._python_agg_general(arg, *args, **kwargs)
1746 else:
-> 1747 result = self._aggregate_generic(arg, *args, **kwargs)
1748
1749 if not self.as_index:
/Users/andy/pandas/pandas/core/groupby.py in _aggregate_generic(self, func, *args, **kwargs)
1803 result[name] = self._try_cast(func(data, *args, **kwargs),data)
1804 except Exception:
-> 1805 return self._aggregate_item_by_item(func, *args, **kwargs)
1806 else:
1807 for name in self.indices:
/Users/andy/pandas/pandas/core/groupby.py in _aggregate_item_by_item(self, func, *args, **kwargs)
1828 colg = SeriesGroupBy(obj[item], selection=item,
1829 grouper=self.grouper)
-> 1830 result[item] = colg.aggregate(func, *args, **kwargs)
1831 except ValueError:
1832 cannot_agg.append(item)
/Users/andy/pandas/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
1425 return self._python_agg_general(func_or_funcs, *args, **kwargs)
1426 except Exception:
-> 1427 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
1428
1429 index = Index(sorted(result), name=self.grouper.names[0])
/Users/andy/pandas/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
1509 output = func(group, *args, **kwargs)
1510 if isinstance(output, np.ndarray):
-> 1511 raise Exception('Must produce aggregated value')
1512 result[name] = self._try_cast(output, group)
1513
Exception: Must produce aggregated value
http://stackoverflow.com/questions/16975318/pandas-aggregate-when-column-contains-numpy-arrays
Instead of the long traceback, let's be more explicit. Why does _aggregate_named in pandas.core.groupby check for ndarray? Is there a reason behind it?
def _aggregate_named(self, func, *args, **kwargs):
    result = {}

    for name, group in self:
        group.name = name
        output = func(group, *args, **kwargs)
        if isinstance(output, np.ndarray):
            raise Exception('Must produce aggregated value')
        result[name] = self._try_cast(output, group)

    return result
My impression is that this happens because the result is unclear / not well-defined. Take the test case that expects an error from test_groupby.TestGroupBy.test_basic:
data = Series(np.arange(9) // 3, index=np.arange(9))
grouped = data.groupby(lambda x: x // 3)
grouped.aggregate(lambda x: x * 3)
Should this return a DataFrame with 3 columns? Transform the column into an array of ndarrays? If it should return an ndarray, you lose all the ability to do fast operations on it, since its dtype changes to object, etc.
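For comparison, here is a quick sketch (reusing the test case above; the transform call and the variable name are just illustrative) of what the non-raising path already gives for a function like this:

import numpy as np
import pandas as pd

data = pd.Series(np.arange(9) // 3, index=np.arange(9))
grouped = data.groupby(lambda x: x // 3)

# transform applies the function group by group and stitches the pieces
# back onto the original index, so a same-length, non-reducing result
# is perfectly well defined here
tripled = grouped.transform(lambda x: x * 3)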
It seems no less well-defined than passing lists.
Agreed that you lose all the fast operations, but if that's the only reason shouldn't this be left up to the user...?
the ndarray test is testing whether this is a reduction; you are right it is not needed
If the function produces a Series, then the result of the groupby operation would be a DataFrame
you can't do this with an ndarray because it can't align; I agree it's sort of a degenerate case,
probably should just create a Series and thus return a Frame...
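To illustrate the alignment point, a small sketch (the frame and the 'lo'/'hi' labels are made up for the example, not anything from the issue): a per-group function that returns a labelled Series can be lined up across groups into a DataFrame, whereas a bare ndarray has no labels to align on.

import pandas as pd

df = pd.DataFrame({'category': [1, 1, 2], 'value': [10, 20, 30]})

# each group yields a Series with the same labels, so the per-group results
# align into one row per group and one column per label
summary = df.groupby('category').apply(
    lambda g: pd.Series({'lo': g['value'].min(), 'hi': g['value'].max()}))
# summary is a DataFrame indexed by category with columns 'lo' and 'hi'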
@hayd do you have an example where this would be a better choice than a different setup (Panel, etc.)? The user in the SO question realized that they were going down the wrong path.
How should we handle this case, where you end up with a DataFrame with arrays in it? Create stacked columns?
data = DataFrame({"A": np.arange(9) // 3, "B": np.arange(9)}, index=np.arange(9))
grouped = data.groupby(lambda x: x // 3)
grouped.aggregate({"A": lambda x: x * 3, "B": lambda x: x[0] * 3})
Could we just give a performance warning? This seems easiest.
_Then we don't need to worry about... lots of things in these weird corner cases._
That makes a lot of sense. I'll write up a PR tonight.
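For the record, a rough sketch of what that could look like, assuming the ndarray check in _aggregate_named (quoted above) warns instead of raising; the PerformanceWarning class here is only a stand-in for whichever warning class ends up living in core/common.py:

import warnings

import numpy as np


class PerformanceWarning(Warning):
    """Warns that an operation fell back to a slow, object-dtype path."""


def _aggregate_named(self, func, *args, **kwargs):
    result = {}

    for name, group in self:
        group.name = name
        output = func(group, *args, **kwargs)
        if isinstance(output, np.ndarray):
            # warn rather than raise: keep the ndarray result but flag that
            # the aggregation falls back to object dtype
            warnings.warn('aggregation produced an ndarray; the result will '
                          'be object dtype and slow', PerformanceWarning)
        result[name] = self._try_cast(output, group)

    return result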
@jtratner
fyi...there is a PerformanceWarning warning I defined in io/pytables.py
why don't you move that to core/common.py?
(you'd then need to import it in io/pytables.py and io/tests/test_pytables.py)
let's revisit in 0.14
This seems to "just work" in master. Could add a test case...
Hi, I still have this problem using pandas 0.18.0, when I'm generating an ndarray as a vector.
Why not allow an ndarray as the aggregated value?
@Earthson as indicated if you are trying to return ndarrays then you are using pandas in a non-performant / non-supported way.
this works in current master. anyone want to add a test?
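For whoever picks that up, a sketch of such a regression test (the test name is a placeholder, and assert_frame_equal is imported from pandas.util.testing here; newer releases expose it as pandas.testing.assert_frame_equal):

import numpy as np
import pandas as pd
import pandas.util.testing as tm


def test_agg_over_numpy_arrays():
    # aggregating a column of ndarrays should sum element-wise within each
    # group instead of raising 'Must produce aggregated value'
    df = pd.DataFrame([[1, np.array([10, 20, 30])],
                       [1, np.array([40, 50, 60])],
                       [2, np.array([20, 30, 40])]],
                      columns=['category', 'arraydata'])
    result = df.groupby('category').agg(sum)

    expected = pd.DataFrame([[np.array([50, 70, 90])],
                             [np.array([20, 30, 40])]],
                            index=pd.Index([1, 2], name='category'),
                            columns=['arraydata'])
    tm.assert_frame_equal(result, expected)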
This seems only partially fixed. Taking the example from the test case:
df = pd.DataFrame([[1, np.array([10, 20, 30])],
                   [1, np.array([40, 50, 60])],
                   [2, np.array([20, 30, 40])]],
                  columns=['category', 'arraydata'])
The following works (that's the test case):
result = df.groupby('category').agg(sum)
But this will fail:
result = df.groupby('category')["arraydata"].agg(sum)
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in f(self, **kwargs)
1153 except Exception:
1154 result = self.aggregate(
-> 1155 lambda x: npfunc(x, axis=self.axis))
1156 if _convert:
1157 result = result._convert(datetime=True)
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
-> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
2886
2887 index = Index(sorted(result), name=self.grouper.names[0])
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
3015 output = func(group, *args, **kwargs)
3016 if isinstance(output, (Series, Index, np.ndarray)):
-> 3017 raise Exception('Must produce aggregated value')
3018 result[name] = self._try_cast(output, group)
3019
Exception: Must produce aggregated value
Or in a similar case:
----> 1 g["mean"].agg(lambda x: np.mean(x))
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2878
2879 if self.grouper.nkeys > 1:
-> 2880 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2881
2882 try:
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
846 for name, obj in self._iterate_slices():
847 try:
--> 848 result, counts = self.grouper.agg_series(obj, f)
849 output[name] = self._try_cast(result, obj, numeric_only=True)
850 except TypeError:
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2178 return self._aggregate_series_fast(obj, func)
2179 except Exception:
-> 2180 return self._aggregate_series_pure_python(obj, func)
2181
2182 def _aggregate_series_fast(self, obj, func):
/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
2213 if (isinstance(res, (Series, Index, np.ndarray)) or
2214 isinstance(res, list)):
-> 2215 raise ValueError('Function does not reduce')
2216 result = np.empty(ngroups, dtype='O')
2217
ValueError: Function does not reduce
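Until the SeriesGroupBy path gets the same treatment, one possible workaround (just a sketch, not an endorsed pattern; the np.stack-based lambda is my own) is to go through apply, which does not insist on a scalar per group:

import numpy as np
import pandas as pd

df = pd.DataFrame([[1, np.array([10, 20, 30])],
                   [1, np.array([40, 50, 60])],
                   [2, np.array([20, 30, 40])]],
                  columns=['category', 'arraydata'])

# apply skips the must-reduce check, so returning an ndarray per group is
# allowed; the price is an object-dtype Series of arrays
result = df.groupby('category')['arraydata'].apply(
    lambda s: np.sum(np.stack(s.values), axis=0))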