Describe the bug
I am trying to repartition a ~4.8 GB dataframe on a 16 GB card. It throws RMM_ERROR_OUT_OF_MEMORY on the latest nightly, but it worked on the January 6th nightly, before the libcudf++ merge.
I measured the peak memory (on a 32 GB card): 16604 MB on the latest nightly vs 9692 MB on the January 6th nightly.
In the real workflow, I can repartition successfully on a 16 GB card with the January 6th nightly, but cannot do it even on a 32 GB card on the latest nightly.
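The report does not say how the peak was measured; a minimal sketch of one way to watch it, polling NVML from a background thread with pynvml (the helper name and polling interval are assumptions, not part of the original report):

import threading
import time

import pynvml

peak_mb = 0  # peak GPU memory observed, in MB

def _poll_gpu_memory(stop, device_index=0, interval=0.1):
    # Poll NVML until `stop` is set, recording the highest used-memory reading.
    global peak_mb
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    while not stop.is_set():
        used = pynvml.nvmlDeviceGetMemoryInfo(handle).used  # bytes
        peak_mb = max(peak_mb, used // (1024 * 1024))
        time.sleep(interval)
    pynvml.nvmlShutdown()

stop = threading.Event()
watcher = threading.Thread(target=_poll_gpu_memory, args=(stop,), daemon=True)
watcher.start()
# ... run the repro below ...
stop.set()
watcher.join()
print("peak GPU memory: {} MB".format(peak_mb))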
Steps/Code to reproduce bug
import dask_cudf
import cudf
import numpy as np
import cudf.utils.cudautils as cudf_cudautils

# Two int64 columns of 300M rows -> ~4.8 GB of device memory.
n_rows = 300_000_000
final = dask_cudf.from_cudf(
    cudf.DataFrame({
        'a': cudf_cudautils.ones(n_rows, dtype=np.int64),
        'b': cudf_cudautils.ones(n_rows, dtype=np.int64),
    }),
    npartitions=50,
)
final = final.persist()
print("persisted = {}".format(len(final)))

# Collapsing to a single partition concatenates all 50 pieces; this is
# the step that runs out of memory on the latest nightly.
final = final.repartition(npartitions=1).persist()
print("Length after repartition = {}".format(len(final)))
Expected behavior
I would expect it to work as it does on the January 6th nightly (before the libcudf++ merge).
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-1-6f42141390fe> in <module>
14 print("persisted = {}".format(len(final)))
15
---> 16 final = final.repartition(npartitions=1).persist()
17 print("Length after repartition = {}".format(len(final)))
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/base.py in persist(self, **kwargs)
136 dask.base.persist
137 """
--> 138 (result,) = persist(self, traverse=False, **kwargs)
139 return result
140
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/base.py in persist(*args, **kwargs)
627 postpersists.append((rebuild, a_keys, state))
628
--> 629 results = schedule(dsk, keys, **kwargs)
630 d = dict(zip(keys, results))
631 results2 = [r({k: d[k] for k in ks}, *s) for r, ks, s in postpersists]
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/local.py in get_sync(dsk, keys, **kwargs)
525 """
526 kwargs.pop("num_workers", None) # if num_workers present, remove it
--> 527 return get_async(apply_sync, 1, dsk, keys, **kwargs)
528
529
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
469 # Seed initial tasks into the thread pool
470 while state["ready"] and len(state["running"]) < num_workers:
--> 471 fire_task()
472
473 # Main loop, wait on tasks to finish, insert new ones
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/local.py in fire_task()
464 pack_exception,
465 ),
--> 466 callback=queue.put,
467 )
468
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/local.py in apply_sync(func, args, kwds, callback)
514 def apply_sync(func, args=(), kwds={}, callback=None):
515 """ A naive synchronous version of apply_async """
--> 516 res = func(*args, **kwds)
517 if callback is not None:
518 callback(res)
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
225 failed = False
226 except BaseException as e:
--> 227 result = pack_exception(e, dumps)
228 failed = True
229 return key, result, failed
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask/dataframe/methods.py in concat(dfs, axis, join, uniform, filter_warning)
354 func = concat_dispatch.dispatch(type(dfs[0]))
355 return func(
--> 356 dfs, axis=axis, join=join, uniform=uniform, filter_warning=filter_warning
357 )
358
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/dask_cudf/backends.py in concat_cudf(dfs, axis, join, uniform, filter_warning, sort)
32 ):
33 assert join == "outer"
---> 34 return cudf.concat(dfs, axis=axis)
35
36
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/reshape.py in concat(objs, axis, ignore_index, sort)
88
89 if typ is DataFrame:
---> 90 return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
91 elif typ is Series:
92 return Series._concat(objs, axis=axis)
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/dataframe.py in _concat(cls, objs, axis, ignore_index)
1928 out = cls(data)
1929
-> 1930 out.index = index
1931
1932 if isinstance(objs[0].columns, cudf.MultiIndex):
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/dataframe.py in __setattr__(self, key, col)
361 try:
362 object.__getattribute__(self, key)
--> 363 object.__setattr__(self, key, col)
364 return
365 except AttributeError:
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/dataframe.py in index(self, value)
1284
1285 # try to build an index from generic _index
-> 1286 idx = as_index(value)
1287 self._index = idx
1288
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/index.py in as_index(arbitrary, **kwargs)
1025
1026 if isinstance(arbitrary, Index):
-> 1027 idx = arbitrary.copy(deep=False)
1028 idx.rename(**kwargs, inplace=True)
1029 return idx
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/index.py in copy(self, deep)
726 result = as_index(self.as_column().copy(deep=True))
727 else:
--> 728 result = as_index(self.as_column().copy())
729 result.name = self.name
730 return result
/raid/vjawa/conda/conda_installation/envs/cudf_12_15_jan/lib/python3.7/site-packages/cudf/core/column/column.py in copy(self, deep)
360 """
361 if deep:
--> 362 return libcudf.copying.copy_column(self)
363 else:
364 return build_column(
cudf/_lib/copying.pyx in cudf._lib.copying.copy_column()
RuntimeError: RMM error encountered at: /conda/conda-bld/libcudf_1579141519220/work/cpp/src/column/legacy/column.cpp:206: 4 RMM_ERROR_OUT_OF_MEMORY
Environment details
It fails in the following environment:
cudf 0.12.0b200116 py37_1385 rapidsai-nightly
dask-cudf 0.12.0b200116 py37_1385 rapidsai-nightly
libcudf 0.12.0b200116 cuda10.0_1385 rapidsai-nightly
It works in the following environment:
cudf 0.12.0b200106 py37_585 rapidsai-nightly
dask-cudf 0.12.0b200106 py37_585 rapidsai-nightly
libcudf 0.12.0b200116 cuda10.0_1422 rapidsai-nightly
CC @pentschev, because you were seeing similar problems.
CC @beckernick, @ayushdg, @shwina
cc @shwina
Not sure if it's the issue, but it looks like we're deep-copying indexes erroneously here; we should be passing deep=False to that copy call.
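For context, a sketch of what that looks like in Index.copy (the frame at cudf/core/index.py line 728 in the traceback above); the if/else body is taken from the traceback, while the signature and its default are assumptions:

def copy(self, deep=True):
    if deep:
        result = as_index(self.as_column().copy(deep=True))
    else:
        # Column.copy() defaults to deep=True, so even this "shallow"
        # path ends up deep-copying the index's device memory. The
        # suggested fix is to pass deep=False explicitly here.
        result = as_index(self.as_column().copy())
    result.name = self.name
    return result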
I think I might have assumed False to be the default, possibly in other places as well. Will audit and fix.
Great catch @kkraus14. Making it copy(deep=False) brings the peak to 11988 MB, which a 16 GB card should be able to accommodate. @VibhuJawa, would that suffice?
Do we want to reopen this, @VibhuJawa @shwina?