Pandas: Aggregating over arrays

Created on 7 Jun 2013  ·  13 Comments  ·  Source: pandas-dev/pandas

This _is_ frowned upon behaviour (storing arrays inside DataFrames) but is there a reason for this raise?

Deleting the raising lines seems to only break tests to check that they're raising...

df = pd.DataFrame([[1,np.array([10,20,30])],
               [1,np.array([40,50,60])], 
               [2,np.array([20,30,40])],], columns=['category','arraydata'])
g = df.groupby('category')
g.agg(sum)
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-34-527a2010b455> in <module>()
----> 1 g.agg(sum)

/Users/andy/pandas/pandas/core/groupby.py in agg(self, func, *args, **kwargs)
    337     @Appender(_agg_doc)
    338     def agg(self, func, *args, **kwargs):
--> 339         return self.aggregate(func, *args, **kwargs)
    340
    341     def _iterate_slices(self):

/Users/andy/pandas/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   1740             cyfunc = _intercept_cython(arg)
   1741             if cyfunc and not args and not kwargs:
-> 1742                 return getattr(self, cyfunc)()
   1743
   1744             if self.grouper.nkeys > 1:

/Users/andy/pandas/pandas/core/groupby.py in f(self)
     62             raise SpecificationError(str(e))
     63         except Exception:
---> 64             result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
     65             if _convert:
     66                 result = result.convert_objects()

/Users/andy/pandas/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   1745                 return self._python_agg_general(arg, *args, **kwargs)
   1746             else:
-> 1747                 result = self._aggregate_generic(arg, *args, **kwargs)
   1748
   1749         if not self.as_index:

/Users/andy/pandas/pandas/core/groupby.py in _aggregate_generic(self, func, *args, **kwargs)
   1803                     result[name] = self._try_cast(func(data, *args, **kwargs),data)
   1804             except Exception:
-> 1805                 return self._aggregate_item_by_item(func, *args, **kwargs)
   1806         else:
   1807             for name in self.indices:

/Users/andy/pandas/pandas/core/groupby.py in _aggregate_item_by_item(self, func, *args, **kwargs)
   1828                 colg = SeriesGroupBy(obj[item], selection=item,
   1829                                      grouper=self.grouper)
-> 1830                 result[item] = colg.aggregate(func, *args, **kwargs)
   1831             except ValueError:
   1832                 cannot_agg.append(item)

/Users/andy/pandas/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   1425                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   1426             except Exception:
-> 1427                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
   1428
   1429             index = Index(sorted(result), name=self.grouper.names[0])

/Users/andy/pandas/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
   1509             output = func(group, *args, **kwargs)
   1510             if isinstance(output, np.ndarray):
-> 1511                 raise Exception('Must produce aggregated value')
   1512             result[name] = self._try_cast(output, group)
   1513

Exception: Must produce aggregated value

http://stackoverflow.com/questions/16975318/pandas-aggregate-when-column-contains-numpy-arrays

Groupby Testing

Most helpful comment

This seems only partially fixed. Taking the example from the test case

        df = pd.DataFrame([[1, np.array([10, 20, 30])],
                           [1, np.array([40, 50, 60])],
                           [2, np.array([20, 30, 40])]],
                          columns=['category', 'arraydata'])

The following will work (that's the test case):
result = df.groupby('category').agg(sum)

But this will fail:
result = df.groupby('category')["arraydata"].agg(sum)

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in f(self, **kwargs)
   1153                 except Exception:
   1154                     result = self.aggregate(
-> 1155                         lambda x: npfunc(x, axis=self.axis))
   1156                     if _convert:
   1157                         result = result._convert(datetime=True)

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2884             except Exception:
-> 2885                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
   2886 
   2887             index = Index(sorted(result), name=self.grouper.names[0])

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
   3015             output = func(group, *args, **kwargs)
   3016             if isinstance(output, (Series, Index, np.ndarray)):
-> 3017                 raise Exception('Must produce aggregated value')
   3018             result[name] = self._try_cast(output, group)
   3019 

Exception: Must produce aggregated value

Or in a similar case:

----> 1 g["mean"].agg(lambda x: np.mean(x))

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2878 
   2879             if self.grouper.nkeys > 1:
-> 2880                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2881 
   2882             try:

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
    846         for name, obj in self._iterate_slices():
    847             try:
--> 848                 result, counts = self.grouper.agg_series(obj, f)
    849                 output[name] = self._try_cast(result, obj, numeric_only=True)
    850             except TypeError:

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
   2178             return self._aggregate_series_fast(obj, func)
   2179         except Exception:
-> 2180             return self._aggregate_series_pure_python(obj, func)
   2181 
   2182     def _aggregate_series_fast(self, obj, func):

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
   2213                 if (isinstance(res, (Series, Index, np.ndarray)) or
   2214                         isinstance(res, list)):
-> 2215                     raise ValueError('Function does not reduce')
   2216                 result = np.empty(ngroups, dtype='O')
   2217 

ValueError: Function does not reduce

All 13 comments

Instead of the long traceback, let's be more explicit. Why does _aggregate_named in pandas.core.groupby check for ndarray? Is there a reason behind it?

def _aggregate_named(self, func, *args, **kwargs):
    result = {}

    for name, group in self:
        group.name = name
        output = func(group, *args, **kwargs)
        if isinstance(output, np.ndarray):
            raise Exception('Must produce aggregated value')
        result[name] = self._try_cast(output, group)

    return result

My impression is that this happens because the result is unclear / not well-defined. Take the test case that expects an error from test_groupby.TestGroupBy.test_basic:

data = Series(np.arange(9) // 3, index=np.arange(9))
grouped = data.groupby(lambda x: x // 3)
grouped.aggregate(lambda x: x * 3)

Should this return a dataframe with 3 columns? transform the column to an array of ndarray? If it should return an ndarray, you lose all of the ability to do fast operations on it, since it changes its dtype to object, etc.

It seems no less well-defined than passing lists.

Agreed that you lose all the fast operations, but if that's the only reason shouldn't this be left up to the user...?

the ndarray test is testing whether this is a reduction; you are right it is not needed

If the function produces a Series, then the result of the groupby operation would be a DataFrame

you can't do this with an ndarray because it can't align; I agree it's sort of a degenerate case,
probably should just create a Series and thus return a Frame...

@hayd do you have an example where this would be a better choice than a different setup (Panel, etc.)? The user in the SO question realized that they were going down the wrong path.

How should we handle this case, where you end up with a DataFrame with arrays in it? Create stacked columns?

data = DataFrame({"A": np.arange(9) // 3, "B": np.arange(9)}, index=np.arange(9))
grouped = data.groupby(lambda x: x.A // 3)
grouped.aggregate({"A": lambda x: x * 3, "B": lambda x: x[0] * 3})

Could we just give a performance warning? This seems easiest.

_Then we don't need to worry about... lots of things in these weird corner cases._

That makes a lot of sense. I'll write up a PR tonight.

On Fri, Jun 7, 2013 at 9:58 AM, Andy Hayden [email protected]:

The easiest solution is to just give a performance warning?

_Then we don't need to worry about... lots of things in these weird
corner cases._

—
Reply to this email directly or view it on GitHub: https://github.com/pydata/pandas/issues/3788#issuecomment-19108370
.

@jtratner

fyi...there is a PerformanceWarning Warning I defined in io/pytables.py
why don't you move that to core/common.py ?
(need import then in io/pytables.py and io/tests/test_pytables.py)

let's revisit in 0.14

This seems to "just work" in master. Could add a test case...

Hi, I still have this problem using pandas 0.18.0, when I'm generating an ndarray as a vector.

why not allow ndarray as aggregated value?

@Earthson as indicated, if you are trying to return ndarrays, then you are using pandas in a non-performant / non-supported way.

this works in current master. anyone want to add a test?

This seems only partially fixed. Taking the example from the test case

        df = pd.DataFrame([[1, np.array([10, 20, 30])],
                           [1, np.array([40, 50, 60])],
                           [2, np.array([20, 30, 40])]],
                          columns=['category', 'arraydata'])

The following will work (that's the test case):
result = df.groupby('category').agg(sum)

But this will fail:
result = df.groupby('category')["arraydata"].agg(sum)

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in f(self, **kwargs)
   1153                 except Exception:
   1154                     result = self.aggregate(
-> 1155                         lambda x: npfunc(x, axis=self.axis))
   1156                     if _convert:
   1157                         result = result._convert(datetime=True)

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2884             except Exception:
-> 2885                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
   2886 
   2887             index = Index(sorted(result), name=self.grouper.names[0])

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
   3015             output = func(group, *args, **kwargs)
   3016             if isinstance(output, (Series, Index, np.ndarray)):
-> 3017                 raise Exception('Must produce aggregated value')
   3018             result[name] = self._try_cast(output, group)
   3019 

Exception: Must produce aggregated value

Or in a similar case:

----> 1 g["mean"].agg(lambda x: np.mean(x))

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2878 
   2879             if self.grouper.nkeys > 1:
-> 2880                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2881 
   2882             try:

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
    846         for name, obj in self._iterate_slices():
    847             try:
--> 848                 result, counts = self.grouper.agg_series(obj, f)
    849                 output[name] = self._try_cast(result, obj, numeric_only=True)
    850             except TypeError:

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
   2178             return self._aggregate_series_fast(obj, func)
   2179         except Exception:
-> 2180             return self._aggregate_series_pure_python(obj, func)
   2181 
   2182     def _aggregate_series_fast(self, obj, func):

/home/data/lelausen/.local/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
   2213                 if (isinstance(res, (Series, Index, np.ndarray)) or
   2214                         isinstance(res, list)):
-> 2215                     raise ValueError('Function does not reduce')
   2216                 result = np.empty(ngroups, dtype='O')
   2217 

ValueError: Function does not reduce

Was this page helpful?
0 / 5 - 0 ratings