Describe the bug
I'd like to convert a date column to datetime format using dask-cudf map_partitions but it yields the error below. Dask does not understand cudf's datetime dtype.
Steps/Code to reproduce bug
import cudf
import dask_cudf
cdf = cudf.DataFrame()
cdf['pickup_datetime'] = ['2015-01-15 19:05:39', '2015-01-10 20:33:38', '2015-01-10 20:33:38']
ddf=dask_cudf.from_cudf(cdf, npartitions=2)
def clean(df_part):
df_part['pickup_datetime'] = df_part['pickup_datetime'].astype('datetime64[s]')
return df_part
ddf = ddf.map_partitions(clean)
This yields the error below:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/utils.py in raise_on_meta_error(funcname, udf)
171 try:
--> 172 yield
173 except Exception as e:
~/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/core.py in _emulate(func, *args, **kwargs)
4940 with raise_on_meta_error(funcname(func), udf=kwargs.pop("udf", False)):
-> 4941 return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
4942
<ipython-input-46-cc6c5c93bcd7> in clean(df_part)
5 def clean(df_part):
----> 6 df_part['pickup_datetime'] = df_part['pickup_datetime'].astype('datetime64[s]')
7 return df_part
~/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/series.py in astype(self, dtype, errors, **kwargs)
1358 if errors == "raise":
-> 1359 raise e
1360 elif errors == "warn":
~/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/series.py in astype(self, dtype, errors, **kwargs)
1354 return self._copy_construct(
-> 1355 data=self._column.astype(dtype, **kwargs)
1356 )
~/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/column/column.py in astype(self, dtype, **kwargs)
771 elif np.issubdtype(dtype, np.datetime64):
--> 772 return self.as_datetime_column(dtype, **kwargs)
773
~/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/column/string.py in as_datetime_column(self, dtype, **kwargs)
2121 def as_datetime_column(self, dtype, **kwargs):
-> 2122 return self.as_numerical_column(dtype, **kwargs)
2123
~/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/column/string.py in as_numerical_column(self, dtype, **kwargs)
2118
-> 2119 return _str_to_numeric_typecast_functions[str_dtype](self, **kwargs)
2120
cudf/_libxx/string_casting.pyx in cudf._libxx.string_casting.timestamp2int()
AttributeError: 'NoneType' object has no attribute 'encode'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-46-cc6c5c93bcd7> in <module>
7 return df_part
8
----> 9 ddf = ddf.map_partitions(clean)
~/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/core.py in map_partitions(self, func, *args, **kwargs)
636 >>> ddf.map_partitions(func).clear_divisions() # doctest: +SKIP
637 """
--> 638 return map_partitions(func, self, *args, **kwargs)
639
640 @insert_meta_param_description(pad=12)
~/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/core.py in map_partitions(func, meta, enforce_metadata, transform_divisions, *args, **kwargs)
4989 # Use non-normalized kwargs here, as we want the real values (not
4990 # delayed values)
-> 4991 meta = _emulate(func, *args, udf=True, **kwargs)
4992 else:
4993 meta = make_meta(meta, index=meta_index)
~/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/core.py in _emulate(func, *args, **kwargs)
4939 """
4940 with raise_on_meta_error(funcname(func), udf=kwargs.pop("udf", False)):
-> 4941 return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
4942
4943
~/yes/envs/xgb/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
~/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/utils.py in raise_on_meta_error(funcname, udf)
191 )
192 msg = msg.format(" in `{0}`".format(funcname) if funcname else "", repr(e), tb)
--> 193 raise ValueError(msg)
194
195
ValueError: Metadata inference failed in `clean`.
You have supplied a custom function and Dask is unable to
determine the type of output that that function returns.
To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.
Original error is below:
------------------------
AttributeError("'NoneType' object has no attribute 'encode'")
Traceback:
---------
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/utils.py", line 172, in raise_on_meta_error
yield
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/dask/dataframe/core.py", line 4941, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "<ipython-input-46-cc6c5c93bcd7>", line 6, in clean
df_part['pickup_datetime'] = df_part['pickup_datetime'].astype('datetime64[s]')
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/series.py", line 1359, in astype
raise e
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/series.py", line 1355, in astype
data=self._column.astype(dtype, **kwargs)
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/column/column.py", line 772, in astype
return self.as_datetime_column(dtype, **kwargs)
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/column/string.py", line 2122, in as_datetime_column
return self.as_numerical_column(dtype, **kwargs)
File "/home/ronaya/yes/envs/xgb/lib/python3.7/site-packages/cudf/core/column/string.py", line 2119, in as_numerical_column
return _str_to_numeric_typecast_functions[str_dtype](self, **kwargs)
File "cudf/_libxx/string_casting.pyx", line 416, in cudf._libxx.string_casting.timestamp2int
By contrast, converting the column with cudf before creating the dask_cudf DataFrame works, and the datetime dtype is preserved:
df = cudf.DataFrame()
df['pickup_datetime'] = ['2015-01-15 19:05:39', '2015-01-10 20:33:38', '2015-01-10 20:33:38']
df['pickup_datetime'] = df['pickup_datetime'].astype('datetime64[s]')
ddf = dask_cudf.from_cudf(df, npartitions=2)
ddf.dtypes
pickup_datetime datetime64[s]
dtype: object
Environment overview (please complete the following information)
This is failing because Dask infers the output metadata by running the function on a small non-empty dummy frame, which looks something like the following:
[ pickup_datetime
0 cat
1 dog]
Since "cat" and "dog" are not in any datetime format, cudf fails to parse them.
@rgsl888prabhu Thanks. We tested the same example with npartitions=1 and it yields the same error.
@rnyak can you please try with the following modification to your sample repro,
ddf = ddf.map_partitions(clean, meta=ddf)
This should resolve your issues.
@rgsl888prabhu Thanks, but it does not solve the issue. This is what I get after running ddf = ddf.map_partitions(clean, meta=ddf):
ddf.dtypes
pickup_datetime object
dtype: object
Also, AFAIK, the meta argument is supposed to be used differently.
@rnyak See https://github.com/dask/dask/issues/6078: according to Dask, we have to provide metadata to handle this scenario.
@rgsl888prabhu I don't think that's accurate. Consider this example:
>>> import cudf, dask_cudf
>>> df = cudf.DataFrame()
>>> df['val'] = [0, 1, 2]
>>>
>>> def add_one(df):
... df['val'] = df['val'] + 1
... return df
...
>>> ddf = dask_cudf.from_cudf(df, npartitions=1)
>>> ddf.map_partitions(add_one).compute()
val
0 1
1 2
2 3
As long as you return a valid cuDF DataFrame, Dask can understand its metadata without making the user specify meta.
The problem @rnyak is highlighting is that Dask (or dask_cudf) doesn't seem to understand a cudf Series created by the call to .astype('datetime64[s]').
@rgsl888prabhu Thanks. Referring to dask/dask#6078, I am closing this issue.