[x] I have checked that this issue has not already been reported.
[x] I have confirmed this bug exists on the latest version of pandas.
[ ] (optional) I have confirmed this bug exists on the master branch of pandas.
Note: Please read this guide detailing how to provide the necessary information for us to reproduce your bug.
# Your code here
def get_t_minus_n_val(n):
    """Build a groupby aggregator that looks ``n`` index units back.

    The returned function receives one group's Series ``x`` and returns the
    value stored at label ``x.index[-1] - n``, or NaN when that label is
    absent.  The debugging hooks from the report are kept: a pdb breakpoint
    for the misbehaving group (``n == 3`` and last label 27) and a print of
    the lookup.
    """
    def f(x):
        # assume index is days_from_origin
        target = x.index[-1] - n
        if n == 3 and x.index[-1] == 27:
            # Stop only on the group where ``x.get`` unexpectedly gave None.
            import pdb; pdb.set_trace()
        # NOTE(review): indentation was lost in the paste; this print is
        # assumed to run for every group, not only inside the branch above.
        print(x.index[-1], n, x.get(target))  # for debugging
        # ``np.nan`` instead of ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0; both name the same value on older versions.
        return x.get(target, np.nan)
    # Name the aggregator after the offset it was built for.
    f.__name__ = f"t_minus_{n}_days"
    return f
# Index the frame by the days_from_origin_ column so each group's Series carries
# it as x.index, then aggregate metric1 per device with the t-minus-3 lookup.
res = data_df.set_index("days_from_origin_").groupby("device").agg({"metric1": get_t_minus_n_val(3)})
> <ipython-input-452-9b903578cb56>(7)f()
-> print(x.index[-1], n, x.get(x.index[-1] - n)) # for debugging
(Pdb) x.get(24)
(Pdb) x.iloc[-5:]
23 60221064
24 232131096
25 46413584
26 133181464
27 229400712
Name: metric1, dtype: int64
(Pdb) 24 in x.index
False
(Pdb) 24 in x.index.values
True
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27],
dtype='int64')
(Pdb) x.to_dict().get(24)
232131096
I am not able to get a minimal repro of this as it seems to be data dependent. Instead, I am capturing the bug by showing you my pdb debugging statements in hopes that someone who knows the code can figure out where the problem is.
Basically, I am doing a custom agg function which needs to grab an element from a Series object, and even though the value clearly exists in the index, it returns None. If I first convert to dict, then it does get the value.
I was not able to repro this by simply creating a new series and calling .get on it... that works just fine. And in fact, if I filter the dataframe for just that device, then it works just fine. It is definitely some sort of internal state issue which happens as a result of groupby having more records...
Obviously, I expect x.get(24) to return the correct value instead of None.
Output of pd.show_versions():
commit : None
python : 3.7.6.final.0
python-bits : 64
OS : Darwin
OS-release : 17.7.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.0.1
numpy : 1.18.1
pytz : 2019.3
dateutil : 2.8.1
pip : 20.0.2
setuptools : 41.2.0
Cython : None
pytest : 5.4.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.1
IPython : 7.13.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.3
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 0.16.0
pytables : None
pytest : 5.4.1
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None
numba : 0.48.0
expected behavior:
def test():
    """Rebuild the failing group's Series outside of groupby and call ``.get(24)``.

    Uses the same index labels (0..27) and values shown in the pdb session
    for the misbehaving group.  Drops into pdb right before the lookup so the
    call can be stepped through; here the lookup succeeds.
    """
    # ``pd.Index(..., dtype='int64')`` replaces ``pd.Int64Index``, which was
    # removed in pandas 2.0; the labels and dtype are unchanged.
    index = pd.Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27],
        dtype='int64')
    s = pd.Series(
        [196878448, 221631200, 242635104, 19671288, 48754392, 165167288, 40217696,
        157245440, 2007336, 27386952, 49185256, 71049952, 89324416, 108032776,
        128883968, 166147504, 220369328, 26889400, 16711448, 81018648, 231554800,
        162081072, 204644960, 60221064, 232131096, 46413584, 133181464, 229400712],
        index=index)
    # Interactive breakpoint: the step-through that follows starts here.
    import pdb; pdb.set_trace()
    print(s.get(24)) # Works!
# Run the standalone check; execution stops at the pdb breakpoint inside test().
test()
> <ipython-input-470-6918416e70f7>(12)test()
-> print(s.get(24))
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/generic.py(3770)get()
-> def get(self, key, default=None):
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/generic.py(3784)get()
-> try:
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/generic.py(3785)get()
-> return self[key]
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(868)__getitem__()
-> def __getitem__(self, key):
(Pdb) s
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(869)__getitem__()
-> key = com.apply_if_callable(key, self)
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(870)__getitem__()
-> try:
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(871)__getitem__()
-> result = self.index.get_value(self, key)
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4373)get_value()
-> @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs)
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4379)get_value()
-> s = extract_array(series, extract_numpy=True)
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(337)extract_array()
-> def extract_array(obj, extract_numpy=False):
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(379)extract_array()
-> if isinstance(obj, (ABCIndexClass, ABCSeries)):
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(380)extract_array()
-> obj = obj.array
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(382)extract_array()
-> if extract_numpy and isinstance(obj, ABCPandasArray):
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(383)extract_array()
-> obj = obj.to_numpy()
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(385)extract_array()
-> return obj
(Pdb) obj
array([196878448, 221631200, 242635104, 19671288, 48754392, 165167288,
40217696, 157245440, 2007336, 27386952, 49185256, 71049952,
89324416, 108032776, 128883968, 166147504, 220369328, 26889400,
16711448, 81018648, 231554800, 162081072, 204644960, 60221064,
232131096, 46413584, 133181464, 229400712])
(Pdb) n
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(385)extract_array()->array([196878...4, 229400712])
-> return obj
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4380)get_value()
-> if isinstance(s, ExtensionArray):
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4400)get_value()
-> s = com.values_from_object(series)
(Pdb) print(s)
[196878448 221631200 242635104 19671288 48754392 165167288 40217696
157245440 2007336 27386952 49185256 71049952 89324416 108032776
128883968 166147504 220369328 26889400 16711448 81018648 231554800
162081072 204644960 60221064 232131096 46413584 133181464 229400712]
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4401)get_value()
-> k = com.values_from_object(key)
(Pdb) print(s)
[196878448 221631200 242635104 19671288 48754392 165167288 40217696
157245440 2007336 27386952 49185256 71049952 89324416 108032776
128883968 166147504 220369328 26889400 16711448 81018648 231554800
162081072 204644960 60221064 232131096 46413584 133181464 229400712]
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4403)get_value()
-> k = self._convert_scalar_indexer(k, kind="getitem")
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4404)get_value()
-> try:
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4405)get_value()
-> return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
(Pdb) print(s)
[196878448 221631200 242635104 19671288 48754392 165167288 40217696
157245440 2007336 27386952 49185256 71049952 89324416 108032776
128883968 166147504 220369328 26889400 16711448 81018648 231554800
162081072 204644960 60221064 232131096 46413584 133181464 229400712]
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(602)_engine()
-> @cache_readonly
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(608)_engine()
-> _ndarray_values = self._ndarray_values
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)_engine()
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb) _ndarray_values
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27])
(Pdb) n
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)_engine()-><pandas._libs...t 0x1fddcaef0>
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb)
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(414)dtype()
-> @property
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(419)dtype()
-> return self._data.dtype
(Pdb)
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(419)dtype()->dtype('int64')
-> return self._data.dtype
(Pdb) n
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)<lambda>()
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)<lambda>()
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb)
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)<lambda>()->array([ 0, 1..., 25, 26, 27])
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb)
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4405)get_value()->232131096
-> return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(873)__getitem__()
-> if not is_scalar(result):
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(885)__getitem__()
-> return result
(Pdb)
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(885)__getitem__()->232131096
-> return result
(Pdb) list
880 result = self._constructor(
881 result, index=[key] * len(result), dtype=self.dtype
882 ).__finalize__(self)
883 except KeyError:
884 pass
885 -> return result
886 except InvalidIndexError:
887 pass
888 except (KeyError, ValueError):
889 if isinstance(key, tuple) and isinstance(self.index, MultiIndex):
890 # kludge
(Pdb) result
232131096
(Pdb) c
232131096
This is the full step-through of the code that gives the unexpected None:
def get_t_minus_n_val(n):
    """Build a groupby aggregator that looks ``n`` index units back.

    The returned function receives one group's Series ``x`` and returns the
    value at label ``x.index[-1] - n``, or NaN when that label is absent.
    For the misbehaving group (``n == 3`` and last label 27) it first drops
    into pdb and prints ``x.get(24)`` so the lookup can be stepped through.
    """
    def f(x):
        # assume index is days_from_origin
        if n == 3 and x.index[-1] == 27:
            import pdb; pdb.set_trace()
            # NOTE(review): indentation was lost in the paste; the hard-coded
            # label 24 (= 27 - 3) suggests this print belongs to this branch.
            print(x.get(24))
        # ``np.nan`` instead of ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0; both name the same value on older versions.
        return x.get(x.index[-1] - n, np.nan)
    # Name the aggregator after the offset it was built for.
    f.__name__ = f"t_minus_{n}_days"
    return f
# Same aggregation as in the original report: index by days_from_origin_, group
# by device, and apply the t-minus-3 lookup to metric1.
res = data_df.set_index("days_from_origin_").groupby("device").agg({"metric1": get_t_minus_n_val(3)})
> <ipython-input-476-3679074309c0>(7)f()
-> print(x.get(24))
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/generic.py(3770)get()
-> def get(self, key, default=None):
(Pdb) s
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/generic.py(3784)get()
-> try:
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/generic.py(3785)get()
-> return self[key]
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(868)__getitem__()
-> def __getitem__(self, key):
(Pdb) s
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(869)__getitem__()
-> key = com.apply_if_callable(key, self)
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(870)__getitem__()
-> try:
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(871)__getitem__()
-> result = self.index.get_value(self, key)
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4373)get_value()
-> @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs)
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4379)get_value()
-> s = extract_array(series, extract_numpy=True)
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(337)extract_array()
-> def extract_array(obj, extract_numpy=False):
(Pdb) s
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(379)extract_array()
-> if isinstance(obj, (ABCIndexClass, ABCSeries)):
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(380)extract_array()
-> obj = obj.array
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(382)extract_array()
-> if extract_numpy and isinstance(obj, ABCPandasArray):
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(383)extract_array()
-> obj = obj.to_numpy()
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(385)extract_array()
-> return obj
(Pdb) obj
array([196878448, 221631200, 242635104, 19671288, 48754392, 165167288,
40217696, 157245440, 2007336, 27386952, 49185256, 71049952,
89324416, 108032776, 128883968, 166147504, 220369328, 26889400,
16711448, 81018648, 231554800, 162081072, 204644960, 60221064,
232131096, 46413584, 133181464, 229400712])
(Pdb) s
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/construction.py(385)extract_array()->array([196878...4, 229400712])
-> return obj
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4380)get_value()
-> if isinstance(s, ExtensionArray):
(Pdb) print(s)
[196878448 221631200 242635104 19671288 48754392 165167288 40217696
157245440 2007336 27386952 49185256 71049952 89324416 108032776
128883968 166147504 220369328 26889400 16711448 81018648 231554800
162081072 204644960 60221064 232131096 46413584 133181464 229400712]
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4400)get_value()
-> s = com.values_from_object(series)
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4401)get_value()
-> k = com.values_from_object(key)
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4403)get_value()
-> k = self._convert_scalar_indexer(k, kind="getitem")
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4404)get_value()
-> try:
(Pdb) k
24
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4405)get_value()
-> return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
(Pdb) s
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(414)dtype()
-> @property
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(419)dtype()
-> return self._data.dtype
(Pdb) self._data.dtype
dtype('int64')
(Pdb) n
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/series.py(419)dtype()->dtype('int64')
-> return self._data.dtype
(Pdb) n
--Call--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)<lambda>()
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb) _ndarray_values
array([0, 1, 2, 3, 4, 5])
(Pdb) len(self)
*** NameError: name 'self' is not defined
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)<lambda>()
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb) n
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(609)<lambda>()->array([0, 1, 2, 3, 4, 5])
-> return self._engine_type(lambda: _ndarray_values, len(self))
(Pdb) n
KeyError: 24
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4405)get_value()
-> return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
(Pdb) print(s)
[196878448 221631200 242635104 19671288 48754392 165167288 40217696
157245440 2007336 27386952 49185256 71049952 89324416 108032776
128883968 166147504 220369328 26889400 16711448 81018648 231554800
162081072 204644960 60221064 232131096 46413584 133181464 229400712]
(Pdb) print(k)
24
(Pdb) n
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4406)get_value()
-> except KeyError as e1:
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4407)get_value()
-> if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
(Pdb)
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4408)get_value()
-> raise
(Pdb)
--Return--
> /Users/sam/.pyenv/versions/3.7.6/envs/pip3env/lib/python3.7/site-packages/pandas/core/indexes/base.py(4408)get_value()->None
-> raise
(Pdb) c
None
@sam-cohan I can't directly help based on the above, but if you could test with the latest release or pandas master, that would also be helpful (there have been some fixes related to groupby recently)
For getting a reproducible example:
I was not able to repro this by simply creating a new series and calling .get on it... that works just fine. And in fact, if I filter the dataframe for just that device, then it works just fine. It is definitely some sort of internal state issue which happens as a result of groupby having more records...
I would indeed try to create a reproducible example with the groupby operation, not the isolated .get, as it is indeed quite likely to be related to some groupby internals.
I suppose you can't share the original data you are experiencing this with. So some tips, as you can typically create similar data that you can share. For example, start with making your data smaller (e.g. taking a part of it) and see if you still get the error. Remove all columns you don't need to reproduce the example. Use dummy names for the columns. Try to replace the values in the columns with dummy data that have the same characteristics. Etc.
Thanks for the feedback. I will try to see if I can get a subset of the data to provide a repro.
@jorisvandenbossche I have updated the description with a minimal repro. Please let me know if it is not clear.
Hmm. Strange, seems like I cannot update that. Here is minimum repro code:
import pandas as pd
# Load the dataset used for the reproduction from a public URL.
data_df = pd.read_csv("http://aws-proserve-data-science.s3.amazonaws.com/predictive_maintenance.csv")
# Convert the date column to datetimes so dates can be subtracted.
data_df["date"] = pd.to_datetime(data_df["date"])
# Whole days elapsed since the earliest date in the data.
data_df["days_from_origin"] = (data_df["date"] - data_df["date"].min()).dt.days
def get_t_minus_n_val(n):
    """Build a groupby aggregator that looks ``n`` index units back.

    The returned function receives one group's Series ``x`` and returns the
    value at label ``x.index[-1] - n``, or None when that label is absent.
    For groups whose last label is 27 it also prints the tail of the group
    and the lookup done two ways (``x.get`` versus a plain dict), which is
    how the report shows the discrepancy.
    """
    def f(x):
        # assume index is days_from_origin
        idx = x.index[-1] - n
        if x.index[-1] == 27:
            # NOTE(review): indentation was lost in the paste; both prints are
            # assumed to belong to this branch.
            print(x.iloc[-n-1:])
            print(x.get(idx), x.to_dict().get(idx)) # x.get(idx) is None but x.to_dict().get(idx) is not None
        return x.get(idx, None)
    # Name the aggregator after the offset it was built for.
    f.__name__ = f"t_minus_{n}_days"
    return f
# Index by the computed day offset, group by device, and apply the t-minus-3
# lookup to metric1; the mismatch is printed for groups whose last day is 27.
res = data_df.set_index("days_from_origin").groupby("device").agg({"metric1": get_t_minus_n_val(3)})
@sam-cohan thanks for the reproducer!
So the good news is that this is fixed again on master (it prints "232131096 232131096" instead of "None 232131096"). But it would still be good to add a test to ensure this keeps working. For a test we still need a more reduced example though (small dataframe that can be created in the tests).
@jorisvandenbossche here is proper minimum repro:
def get_t_minus_n_val(n):
    """Build a groupby aggregator that looks ``n`` index units back.

    The returned function receives one group's Series ``x``, prints the last
    ``n + 1`` rows and the lookup done two ways (``x.get`` versus a plain
    dict), and returns the value at label ``x.index[-1] - n``, or None when
    that label is absent.
    """
    def f(x):
        print(x.iloc[-n-1:])
        idx = x.index[-1] - n
        # x.get(idx) is None but x.to_dict().get(idx) is not None
        print(
            f"idx={idx}"
            f", x.get(idx)={x.get(idx)}"
            f", x.to_dict().get(idx)={x.to_dict().get(idx)}")
        return x.get(idx, None)
    # Name the aggregator after the offset it was built for.
    f.__name__ = f"t_minus_{n}_days"
    return f
# Groups "S" and "W" interleave over index labels 0-5 (each label occurs once
# per group); only "W" continues with the extra labels 6-9.
df = pd.DataFrame({
"A": [
'S', 'W', 'S', 'W', 'S', 'W', 'S', 'W', 'S', 'W', 'S', 'W',
'W', 'W', 'W','W'
],
"B": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
7, 7, 7, 7],
}, index=[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9],
)
# With n=3 the lookup targets label 2 for "S" (last label 5) and label 6 for
# "W" (last label 9); both labels exist in their group.
df.groupby("A").agg(
{"B": get_t_minus_n_val(3)}
)
Interestingly, if you rename 'S' to "foo" and 'W' to "bar", it won't have the bug, so the values of the string column "A" are significant to the bug!
Better still:
# Further-reduced reproducer: group "S" has one row (label 0), group "W" has
# two rows (labels 1 and 2).
df = pd.DataFrame({
"A": ["S", "W", "W"],
"B": [1.0, 1.0, 2.0],
})
# Each group returns the value stored at its own last index label.
res = df.groupby("A").agg(
{"B": lambda x: x.get(x.index[-1])}
)
# Expected: "S" -> 1.0 (label 0) and "W" -> 2.0 (label 2), indexed by "A".
expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A")
pd.testing.assert_frame_equal(res, expected)
@jorisvandenbossche @simonjayhawkins I added a PR with the test. Who can I follow up with to get it merged?
So the good news is that this is fixed again on master
fixed in #32611 (i.e. 1.1)
fa48f5f098b5ed49a08b9d19f072559f74e1ccc2 is the first new commit
commit fa48f5f098b5ed49a08b9d19f072559f74e1ccc2
Author: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed Mar 11 21:30:02 2020 -0700
REF: implement _get_engine_target (#32611)
@sam-cohan thanks for the better reproducible example and the PR!