[x] I have checked that this issue has not already been reported.
[x] I have confirmed this bug exists on the latest version of pandas.
I checked in 1.1.1
[ ] (optional) I have confirmed this bug exists on the master branch of pandas.
In [11]: import pandas as pd
...: import numpy as np
...:
...: df = pd.DataFrame({
...: 'animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
...: 'type': [np.nan, np.nan, np.nan, np.nan],
...: 'speed': [380., 370., 24., 26.]
...: })
...: speed = df.groupby(['animal', 'type'], dropna=False)['speed'].first()
In [12]: speed
Out[12]:
animal type
Falcon NaN 380.0
Parrot NaN 24.0
Name: speed, dtype: float64
In [13]: speed.index.levels
Out[13]: FrozenList([['Falcon', 'Parrot'], [nan]])
In [14]: speed.index.codes
Out[14]: FrozenList([[0, 1], [0, 0]])
In [15]: # Reconstruct same index to allow for multiplication.
...: ix_wing = pd.MultiIndex.from_tuples(
...: [('Falcon', np.nan), ('Parrot', np.nan)], names=['animal', 'type']
...: )
...: wing = pd.Series([42, 44], index=ix_wing)
In [16]: wing
Out[16]:
animal type
Falcon NaN 42
Parrot NaN 44
dtype: int64
In [17]: wing.index.levels
Out[17]: FrozenList([['Falcon', 'Parrot'], []])
In [18]: wing.index.codes
Out[18]: FrozenList([[0, 1], [-1, -1]])
In [19]:
In [19]:
In [19]: speed * wing
Out[19]:
animal type
Falcon NaN NaN
NaN NaN
Parrot NaN NaN
NaN NaN
dtype: float64
I'm trying to combine two series (say, by multiplication for now). One of them is obtained by a groupby aggregation (say, first) and the other series is constructed manually. Both series have a MultiIndex which _should_ be the same, so the multiplication should work fine. However, it seems that groupby(..., dropna=False) creates a different MI, which causes the operation to return an unexpected result.
I would expect the result of speed * wing to be
Falcon NaN 15960.0
Parrot NaN 1056.0
pd.show_versions()
commit : f2ca0a2665b2d169c97de87b8e778dbed86aea07
python : 3.8.5.final.0
python-bits : 64
OS : Linux
OS-release : 5.7.17-200.fc32.x86_64
Version : #1 SMP Fri Aug 21 15:23:46 UTC 2020
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_AU.UTF-8
LOCALE : en_AU.UTF-8
pandas : 1.1.1
numpy : 1.18.4
pytz : 2019.3
dateutil : 2.8.1
pip : 20.2.1
setuptools : 47.3.1
Cython : 0.29.17
pytest : 5.1.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : 0.9.6
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.8.3 (dt dec pq3 ext lo64)
jinja2 : 2.11.2
IPython : 7.17.0
pandas_datareader: None
bs4 : None
bottleneck : 1.3.1
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : None
numexpr : 2.7.1
odfpy : None
openpyxl : 1.8.6
pandas_gbq : None
pyarrow : 0.17.1
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : 1.3.12
tables : 3.6.1
tabulate : None
xarray : None
xlrd : 1.2.0
xlwt : None
numba : None
When recreating the MI of the groupby operation, the multiplication works as expected, so I suspect the groupby grouping MI creation is where it fails.
In [59]: speed = df.groupby(['animal', 'type'], dropna=False)['speed'].first()
...: speed.index = pd.MultiIndex.from_tuples(speed.index)
...:
...: speed * wing
Out[59]:
Falcon NaN 15960.0
Parrot NaN 1056.0
dtype: float64
As a workaround:
In [68]: wing.reindex_like(speed) * speed
Out[68]:
animal type
Falcon NaN 15960.0
Parrot NaN 1056.0
dtype: float64
I'm not sure which representation is preferable (storing NaN in the levels and using regular codes, or not storing NaN and using -1), but we should be consistent. cc @topper-123.
Unsure if this is the same underlying issue, but it fits the title:
In [3]: df = pd.DataFrame(data={"a": [1, 2, 3, np.nan, 4], "b": ["a", "b", "c", "d", np.nan], "c": [0, 12, 23, 45, 56]})
In [4]: df
a b c
0 1.0 a 0
1 2.0 b 12
2 3.0 c 23
3 NaN d 45
4 4.0 NaN 56
In [5]: df.groupby(["a", "c"], dropna=False).sum().groupby(["a", "c"], dropna=True).sum()
b
a c
1.0 0 a
2.0 12 b
3.0 23 c
4.0 56 0
NaN 45 d
In [6]: idx = pd.MultiIndex.from_tuples([(1.0, 0), (2.0, 12), (3.0, 23), (4.0, 56), (np.nan, 45)], names=('a', 'b'))
In [8]: df2 = pd.DataFrame(["a", "b", "c", np.nan, "d"], index=idx)
In [9]: df2
0
a b
1.0 0 a
2.0 12 b
3.0 23 c
4.0 56 NaN
NaN 45 d
In [12]: df2.groupby(["a", "b"], dropna=True).first()
0
a b
1.0 0 a
2.0 12 b
3.0 23 c
4.0 56 NaN
After performing a groupby, a subsequent groupby with dropna=True will not drop NaNs from the keys. I have also seen the opposite, where a subsequent groupby with dropna=False drops NaNs, but I cannot reproduce this in a small example.
My workaround of rewriting the index can cause another issue (reindex_like isn't applicable for my use case, as the other operand of the arithmetic operation isn't available yet).
Pandas version: 1.1.2
In [12]: import pandas as pd
...: import numpy as np
...:
...: df = pd.DataFrame({
...: 'a': [pd.NaT, pd.NaT],
...: 'b': [3, 4],
...: 'c': [2.0, 3.0],
...: 'd': [5, 4]
...: })
...: dfg = df.groupby(by=['a', 'b', 'c'], dropna=False).first()
...: new_index = pd.MultiIndex.from_tuples(dfg.index, names=dfg.index.names)
...: dfg.index = new_index
...: dfg.reset_index()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-908344b5e83b> in <module>
11 new_index = pd.MultiIndex.from_tuples(dfg.index, names=dfg.index.names)
12 dfg.index = new_index
---> 13 dfg.reset_index()
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
4851 name = tuple(name_lst)
4852 # to ndarray and maybe infer different dtype
-> 4853 level_values = _maybe_casted_values(lev, lab)
4854 new_obj.insert(0, name, level_values)
4855
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/frame.py in _maybe_casted_values(index, labels)
4784 dtype = index.dtype
4785 fill_value = na_value_for_dtype(dtype)
-> 4786 values = construct_1d_arraylike_from_scalar(
4787 fill_value, len(mask), dtype
4788 )
~/envs/pandas-test/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1556
1557 subarr = np.empty(length, dtype=dtype)
-> 1558 subarr.fill(value)
1559
1560 return subarr
ValueError: cannot convert float NaN to integer
I reopened another issue providing more detail on the above.