from io import StringIO
import pandas as pd
data = '''Date,Amount
10/30/2010,54
11/20/2010,53'''
df = pd.read_csv(StringIO(data), parse_dates=[0])
df.groupby([df.Date.dt.year, df.Date.dt.month]).mean()
This worked prior to 0.23 and seems like a regression. I now get the following error:
ValueError Traceback (most recent call last)
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
4017 result, _ = self.grouper.aggregate(
-> 4018 block.values, how, axis=agg_axis, min_count=min_count)
4019 except NotImplementedError:
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in aggregate(self, values, how, axis, min_count)
2626 return self._cython_operation('aggregate', values, how, axis,
-> 2627 min_count=min_count)
2628
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_operation(self, kind, values, how, axis, min_count, **kwargs)
2532 "supported for the 'how' argument")
-> 2533 out_shape = (self.ngroups,) + values.shape[1:]
2534
pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in ngroups(self)
2361 def ngroups(self):
-> 2362 return len(self.result_index)
2363
pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in result_index(self)
2380 verify_integrity=False,
-> 2381 names=self.names)
2382 return result
~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __new__(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)
231 # handles name validation
--> 232 result._set_names(names)
233
~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _set_names(self, names, level, validate)
694 'level {}, is already used for level '
--> 695 '{}.'.format(name, l, used[name]))
696
ValueError: Duplicated level name: "Date", assigned to level 1, is already used for level 0.
During handling of the above exception, another exception occurred:
UnboundLocalError Traceback (most recent call last)
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, *args, **kwargs)
1305 try:
-> 1306 return self._cython_agg_general('mean', **kwargs)
1307 except GroupByError:
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
3973 new_items, new_blocks = self._cython_agg_blocks(
-> 3974 how, alt=alt, numeric_only=numeric_only, min_count=min_count)
3975 return self._wrap_agged_blocks(new_items, new_blocks)
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
4038 # see if we can cast the block back to the original dtype
-> 4039 result = block._try_coerce_and_cast_result(result)
4040 newb = block.make_block(result)
UnboundLocalError: local variable 'result' referenced before assignment
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-13-4b644a75a9ce> in <module>()
10 df.dtypes
11
---> 12 df.groupby([df.Date.dt.year, df.Date.dt.month]).mean()
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, *args, **kwargs)
1310 with _group_selection_context(self):
1311 f = lambda x: x.mean(axis=self.axis, **kwargs)
-> 1312 return self._python_agg_general(f)
1313
1314 @Substitution(name='groupby')
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _python_agg_general(self, func, *args, **kwargs)
1086 output[name] = self._try_cast(values[mask], result)
1087
-> 1088 return self._wrap_aggregated_output(output)
1089
1090 def _wrap_applied_output(self, *args, **kwargs):
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _wrap_aggregated_output(self, output, names)
4728 result = result._consolidate()
4729 else:
-> 4730 index = self.grouper.result_index
4731 result = DataFrame(output, index=index, columns=output_keys)
4732
pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~/.env/36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in result_index(self)
2379 labels=labels,
2380 verify_integrity=False,
-> 2381 names=self.names)
2382 return result
2383
~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __new__(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)
230 if names is not None:
231 # handles name validation
--> 232 result._set_names(names)
233
234 if sortorder is not None:
~/.env/36/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _set_names(self, names, level, validate)
693 'Duplicated level name: "{}", assigned to '
694 'level {}, is already used for level '
--> 695 '{}.'.format(name, l, used[name]))
696
697 self.levels[l].rename(name, inplace=True)
ValueError: Duplicated level name: "Date", assigned to level 1, is already used for level 0.
Date Date
2010 10 54
11 53
Output of pd.show_versions():
commit: None
python: 3.6.4.final.0
python-bits: 64
OS: Darwin
OS-release: 16.5.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8
pandas: 0.23.0
pytest: 3.0.6
pip: 10.0.1
setuptools: 39.2.0
Cython: 0.27.3
numpy: 1.14.3
scipy: 1.1.0
pyarrow: None
xarray: None
IPython: 6.2.1
sphinx: 1.7.5
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: 2.4.7
xlrd: 1.0.0
xlwt: 1.2.0
xlsxwriter: None
lxml: 3.7.2
bs4: 4.6.0
html5lib: 0.9999999
sqlalchemy: 1.2.5
pymysql: None
psycopg2: 2.7.4 (dt dec pq3 ext lo64)
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
Thanks for the report. This is a dupe of #21075 (see comments for a workaround), which was closed in favor of #19029.
(My bad with the search fu). It seems to me that pulling off a datetime attribute should change the column name. I see the PR that tacks on 1,2,etc, but wouldn't it be better to tack on "_month", "_hour", etc?
No worries. Would rather have two dupes than no issue being reported at all!
For the workaround, I was referring to https://github.com/pandas-dev/pandas/issues/21075#issuecomment-389484883, or in your case:
df.groupby([df.Date.dt.year.rename('year'), df.Date.dt.month.rename('month')]).mean()
I suspect the PR is tacking on 0,1,... since that handles things most generically, and it corresponds to the level enumeration of a MultiIndex.
As to why something like df.Date.dt.year doesn't automatically change the name? I don't know for sure, but I suspect it's a combination of:
1) I don't know of a well agreed upon convention of how to rename (rename entirely, add suffixes, etc.)
2) There's a rename method that's easily chainable to explicitly accommodate any desired renaming
3) Probably best to retain existing behavior in the absence of a clearly better alternative
That being said, feel free to open a new issue if you feel strongly about automatically renaming these things.
Most helpful comment
No worries. Would rather have two dupes than no issue being reported at all!
For the workaround, I was referring to https://github.com/pandas-dev/pandas/issues/21075#issuecomment-389484883, or in your case:
df.groupby([df.Date.dt.year.rename('year'), df.Date.dt.month.rename('month')]).mean()
I suspect the PR is tacking on 0,1,... since that handles things most generically, and it corresponds to the level enumeration of a MultiIndex.
As to why something like df.Date.dt.year doesn't automatically change the name? I don't know for sure, but I suspect it's a combination of:
1) I don't know of a well agreed upon convention of how to rename (rename entirely, add suffixes, etc.)
2) There's a rename method that's easily chainable to explicitly accommodate any desired renaming
3) Probably best to retain existing behavior in the absence of a clearly better alternative
That being said, feel free to open a new issue if you feel strongly about automatically renaming these things.