Hey team,
I've been trying to debug this memory allocation issue for a while now, but can't seem to find a way out.
Firstly, on what I'm trying to do - I'm trying to implement a Bayesian NN using PyMC3. An example notebook is here.
Next up, my environment:
pygpu 0.6.8, libgpuarray 0.6.8
The error I keep getting when I run the aforementioned notebook crops up at the stage when sampling from the PPC:
with model:
samp_ppc = pm.sample_ppc(trace, samples=100)
Error message is:
0%| | 0/100 [00:00<?, ?it/s]Problem occurred during compilation with the command line below:
/usr/bin/g++ -shared -g -O3 -fno-math-errno -Wno-unused-label -Wno-unused-variable -Wno-write-strings -march=broadwell -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mbmi2 -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mno-avx512f -mno-avx512er -mno-avx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512ifma -mno-avx512vbmi -mno-clwb -mno-pcommit -mno-mwaitx --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=8192 -mtune=generic -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION -m64 -fPIC -I/home/ericmjl/anaconda/envs/bayesian/lib/python3.6/site-packages/pygpu -I/home/ericmjl/anaconda/envs/bayesian/lib/python3.6/site-packages/numpy/core/include -I/home/ericmjl/anaconda/envs/bayesian/include -I/home/ericmjl/anaconda/envs/bayesian/lib/python3.6/site-packages/numpy/core/include -I/home/ericmjl/anaconda/envs/bayesian/include/python3.6m -I/home/ericmjl/github/software/Theano/theano/gof -L/home/ericmjl/anaconda/envs/bayesian/lib -L/home/ericmjl/anaconda/envs/bayesian/lib -fvisibility=hidden -o /home/ericmjl/.theano/compiledir_Linux-4.10--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/tmp595s6h99/ma82e960c1010cb6654d49168be28c6b4.so /home/ericmjl/.theano/compiledir_Linux-4.10--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/tmp595s6h99/mod.cpp -lgpuarray -lpython3.6m
ERROR (theano.gof.cmodule): [Errno 12] Cannot allocate memory
Stack trace looks like this:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-16-f76c5aea1bad> in <module>()
1 with model:
----> 2 samp_ppc = pm.sample_ppc(trace, samples=100)
~/github/software/pymc3/pymc3/sampling.py in sample_ppc(trace, samples, model, vars, size, random_seed, progressbar)
533 for var in vars:
534 ppc[var.name].append(var.distribution.random(point=param,
--> 535 size=size))
536
537 except KeyboardInterrupt:
~/github/software/pymc3/pymc3/distributions/multivariate.py in random(self, point, size)
509
510 def random(self, point=None, size=None):
--> 511 n, p = draw_values([self.n, self.p], point=point)
512 samples = generate_samples(self._random, n, p,
513 dist_shape=self.shape,
~/github/software/pymc3/pymc3/distributions/distribution.py in draw_values(params, point)
192 values = []
193 for param in params:
--> 194 values.append(_draw_value(param, point=point, givens=givens.values()))
195 return values
196
~/github/software/pymc3/pymc3/distributions/distribution.py in _draw_value(param, point, givens)
256 else:
257 variables = values = []
--> 258 func = _compile_theano_function(param, variables)
259 return func(*values)
260 else:
~/github/software/pymc3/pymc3/memoize.py in memoizer(*args, **kwargs)
14
15 if key not in cache:
---> 16 cache[key] = obj(*args, **kwargs)
17
18 return cache[key]
~/github/software/pymc3/pymc3/distributions/distribution.py in _compile_theano_function(param, vars, givens)
218 rebuild_strict=True,
219 on_unused_input='ignore',
--> 220 allow_input_downcast=True)
221
222
~/github/software/Theano/theano/compile/function.py in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
324 on_unused_input=on_unused_input,
325 profile=profile,
--> 326 output_keys=output_keys)
327 # We need to add the flag check_aliased inputs if we have any mutable or
328 # borrowed used defined inputs
~/github/software/Theano/theano/compile/pfunc.py in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
484 accept_inplace=accept_inplace, name=name,
485 profile=profile, on_unused_input=on_unused_input,
--> 486 output_keys=output_keys)
487
488
~/github/software/Theano/theano/compile/function_module.py in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
1807 output_keys=output_keys)
1808 with theano.configparser.change_flags(compute_test_value="off"):
-> 1809 fn = m.create(defaults)
1810 finally:
1811 t2 = time.time()
~/github/software/Theano/theano/compile/function_module.py in create(self, input_storage, trustme, storage_map)
1671 theano.config.traceback.limit = theano.config.traceback.compile_limit
1672 _fn, _i, _o = self.linker.make_thunk(
-> 1673 input_storage=input_storage_lists, storage_map=storage_map)
1674 finally:
1675 theano.config.traceback.limit = limit_orig
~/github/software/Theano/theano/gof/link.py in make_thunk(self, input_storage, output_storage, storage_map)
697 return self.make_all(input_storage=input_storage,
698 output_storage=output_storage,
--> 699 storage_map=storage_map)[:3]
700
701 def make_all(self, input_storage, output_storage):
~/github/software/Theano/theano/gof/vm.py in make_all(self, profiler, input_storage, output_storage, storage_map)
1082 compute_map,
1083 [],
-> 1084 impl=impl))
1085 linker_make_thunk_time[node] = time.time() - thunk_start
1086 if not hasattr(thunks[-1], 'lazy'):
~/github/software/Theano/theano/gof/op.py in make_thunk(self, node, storage_map, compute_map, no_recycling, impl)
953 try:
954 return self.make_c_thunk(node, storage_map, compute_map,
--> 955 no_recycling)
956 except (NotImplementedError, utils.MethodNotDefined):
957 # We requested the c code, so don't catch the error.
~/github/software/Theano/theano/gof/op.py in make_c_thunk(self, node, storage_map, compute_map, no_recycling)
856 _logger.debug('Trying CLinker.make_thunk')
857 outputs = cl.make_thunk(input_storage=node_input_storage,
--> 858 output_storage=node_output_storage)
859 thunk, node_input_filters, node_output_filters = outputs
860
~/github/software/Theano/theano/gof/cc.py in make_thunk(self, input_storage, output_storage, storage_map, keep_lock)
1211 cthunk, module, in_storage, out_storage, error_storage = self.__compile__(
1212 input_storage, output_storage, storage_map,
-> 1213 keep_lock=keep_lock)
1214
1215 res = _CThunk(cthunk, init_tasks, tasks, error_storage, module)
~/github/software/Theano/theano/gof/cc.py in __compile__(self, input_storage, output_storage, storage_map, keep_lock)
1151 output_storage,
1152 storage_map,
-> 1153 keep_lock=keep_lock)
1154 return (thunk,
1155 module,
~/github/software/Theano/theano/gof/cc.py in cthunk_factory(self, error_storage, in_storage, out_storage, storage_map, keep_lock)
1607 node.op.prepare_node(node, storage_map, None, 'c')
1608 module = get_module_cache().module_from_key(
-> 1609 key=key, lnk=self, keep_lock=keep_lock)
1610
1611 vars = self.inputs + self.outputs + self.orphans
~/github/software/Theano/theano/gof/cmodule.py in module_from_key(self, key, lnk, keep_lock)
1162 try:
1163 location = dlimport_workdir(self.dirname)
-> 1164 module = lnk.compile_cmodule(location)
1165 name = module.__file__
1166 assert name.startswith(location)
~/github/software/Theano/theano/gof/cc.py in compile_cmodule(self, location)
1510 lib_dirs=self.lib_dirs(),
1511 libs=libs,
-> 1512 preargs=preargs)
1513 except Exception as e:
1514 e.args += (str(self.fgraph),)
~/github/software/Theano/theano/gof/cmodule.py in compile_str(module_name, src_code, location, include_dirs, lib_dirs, libs, preargs, py_module, hide_symbols)
2296
2297 try:
-> 2298 p_out = output_subprocess_Popen(cmd)
2299 compile_stderr = decode(p_out[1])
2300 except Exception:
~/github/software/Theano/theano/misc/windows.py in output_subprocess_Popen(command, **params)
75 params['stdout'] = subprocess.PIPE
76 params['stderr'] = subprocess.PIPE
---> 77 p = subprocess_Popen(command, **params)
78 # we need to use communicate to make sure we don't deadlock around
79 # the stdout/stderr pipe.
~/github/software/Theano/theano/misc/windows.py in subprocess_Popen(command, **params)
41
42 try:
---> 43 proc = subprocess.Popen(command, startupinfo=startupinfo, **params)
44 finally:
45 if stdin is not None:
~/anaconda/envs/bayesian/lib/python3.6/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
705 c2pread, c2pwrite,
706 errread, errwrite,
--> 707 restore_signals, start_new_session)
708 except:
709 # Cleanup if the child failed starting.
~/anaconda/envs/bayesian/lib/python3.6/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1258 errread, errwrite,
1259 errpipe_read, errpipe_write,
-> 1260 restore_signals, start_new_session, preexec_fn)
1261 self._child_created = True
1262 finally:
OSError: [Errno 12] Cannot allocate memory
I initially suspected that this was a Theano problem, so I've cross-posted there already (will update with more detail), but could there be something that I'm doing wrong here?
Any help's appreciated, but if you guys are too busy, no rush here!
Some more notes and updates.
First off, what my .theanorc looks like:
[global]
device = cuda
floatX = float32
[cuda]
root = /usr/local/cuda
Nothing fancy here, I think.
When using PyMC3 v3.1 from conda-forge, the kernel dies spontaneously whenever I'm trying to sample traces from the posterior distribution. This is consistent behaviour across all of the notebooks that I have in my Bayesian analysis recipes repository.
Switching to master branch of PyMC3 doesn't change things.
From prior experience, I suspected pygpu might be the culprit. I am currently using 0.6.8, but the latest version is 0.6.9. I updated pygpu to 0.6.9. Testing using the notebook linreg-professor-salary, ADVI init proceeds fine, and NUTS sampling is slow - regardless, I'm happy it's working.
I then switched to a different notebook to see if things were working okay - multiclass-logistic-regression-cover-type. Found out that I had accidentally re-installed conda-forge PyMC3 3.1, so I switched back to a master-branch development install of PyMC3. Sampling traces are fine, @junpenglao's latest fixes for multinomial are where the code broke - now facing ValueError: sum(pvals[:-1]) > 1.0 error.
I switched then to my multivariate_fix branch where I have the hack mentioned below implemented. Sampling went fine. Concluded that original memory errors did not show up.
With this, I went back to the linreg-professor-salary notebook. The notebook had no issues.
Confident that my "baseline" PyMC3 work could be done, I proceeded to try out the neural network notebook, multiclass-classification-neural-network. Previously, I always had issues with sampling traces, because the kernel would spontaneously die on me at that cell. To speed up the iteration cycle, I only did a partial ADVI fit (10,000 iterations). Sampling traces went fine, but I ran into the same old error as shown above.
Side note on multinomial:
float32 vs. float64 error: https://github.com/numpy/numpy/issues/8317
To eliminate a moving part, how do things go if you run this as a script, instead of in the notebook?
Hmmm, let me give that a shot.
Same issue shows up - OSError: [Errno 12] Cannot allocate memory. I've been working on the premise that my environment is messed up somehow, so I'm documenting what I'm doing in the comment above your first one, @ColCarroll.
Man, I'm feeling really defeated right now... Working with the GPU is really difficult...
FWIW, I am cross-linking this issue with the same post on Theano.
GPU computing is tricky. I take it the model runs fine on CPUs?
Smaller models are working fine. With the Bayesian NN model, when forcing usage of the CPU, the kernel dies spontaneously. I think it's a memory-related issue - Theano devs have responded.
I ran the MNIST Conv Net on a very large instance (128GB I think) as it does require a lot of RAM. @ericmjl have you tried the same model without pymc3?
@twiecki I haven't - I did forget to mention earlier on that I have successfully run the model within PyMC3 on my machine. I did something (cannot remember what) to my environment, and suddenly these memory issues cropped up. Rebuilding the environment from scratch gave me this memory issue.
I'll give your suggestion a shot.
Updating my notes here for the benefit of everybody, in case someone else comes across the same issue. cc those who have partaken in the discussion: @twiecki, @fonnesbeck, @ColCarroll
I began manually profiling my Jupyter notebook code, chunk-by-chunk. It looks like my memory woes are being caused by the following line:
with model:
trace = approx.sample(5000)
Using that, all 16GB of my CPU RAM + some swap space was being consumed. I saw this by using the free command at the terminal.
Seeing as this was the issue, I hypothesized that the number of samples being drawn was the issue. Thus, I dropped the number of samples from 5K to 1K:
with model:
trace = approx.sample(1000)
And now my memory woes are gone, with only 6GB of RAM being consumed.
Now I'm a happy Bayesian :smile:, and might want to talk about the basics of Bayesian NNs next year at PyCon! :smiley: :smiley:
That said, it got me thinking - ~6GB of RAM consumption? Is this normal?
To @twiecki: do you happen to know if there's a rule-of-thumb quick way of calculating memory consumption for NNs?
So, you have a large model, resulting in a big preallocated trace? I guess this speaks to us still needing a non-RAM backend. Can you try seeing if 5k samples works with either a text or hdf5 trace? Would be nice to use dask for this.
Yep, I have encountered similar problems training NNs in pymc3.
One thing I have done in the past is separate my runs into multiple traces. I don't want to use the words "burn-in" and "thinning" for fear of inciting a flame-war, but those two knobs do allow micro-management of RAM usage.
with model:
trace = approx.sample(10)
trace = approx.sample(500)
Thanks for all the detail @ericmjl -- I have nothing helpful to add, but quick napkin math from looking at the notebook you posted: the model specification has six objects, whose sizes appear to be:
which means there are ~1 million np.float32 being stored for each sample. This comes out to ~4MB. This means 1,000 samples would be ~4GB, which is within an order of magnitude of your calculations (and explains why 5,000 samples would be pushing it on a 16GB machine).
@fonnesbeck: Thanks for the tip! I'm groping around in the dark w.r.t. how to pass in trace='text' - this is what I have:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-23-265b5ff3aeb0> in <module>()
1 with model:
----> 2 trace = approx.sample(1000, trace='text')
TypeError: sample() got an unexpected keyword argument 'trace'
I dug into the PyMC3 codebase - it looks like approx.sample comes from here, and doesn't have the trace kwarg.
Doing some manual tracing (no pun intended) through the PyMC3 codebase, I'm starting to feel genuinely lost here on how I could store the approx. traces to, say, an HDF5 file. Would you be kind enough to point me in the right direction?
@ColCarroll: Thanks too for helping me think through this! Yes, this now makes a lot of sense - each trace has to be one set of sampled parameter values.
@kyleabeauchamp: Thanks too for chiming in! I'll keep that tip in mind.
You'd want to import the HDF5 or Text backend and then:
trace = pm.sample(5000, trace=HDF5('trace.h5') )
@fonnesbeck: hmm, I've just tried that, but I get an error.
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-26-4c38acdde054> in <module>()
1 with model:
2 # trace = approx.sample(1000)
----> 3 trace = pm.sample(trace=HDF5('trace.h5'))
~/github/software/pymc3/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain, njobs, tune, nuts_kwargs, step_kwargs, progressbar, model, random_seed, live_plot, discard_tuned_samples, live_plot_kwargs, **kwargs)
278 discard = tune if discard_tuned_samples else 0
279
--> 280 return sample_func(**sample_args)[discard:]
281
282
~/github/software/pymc3/pymc3/sampling.py in _sample(draws, step, start, trace, chain, tune, progressbar, model, random_seed, live_plot, live_plot_kwargs, **kwargs)
293 try:
294 strace = None
--> 295 for it, strace in enumerate(sampling):
296 if live_plot:
297 if live_plot_kwargs is None:
~/anaconda/envs/bayesian/lib/python3.6/site-packages/tqdm-4.15.0-py3.6.egg/tqdm/_tqdm.py in __iter__(self)
870 """, fp_write=getattr(self.fp, 'write', sys.stderr.write))
871
--> 872 for obj in iterable:
873 yield obj
874 # Update and print the progressbar.
~/github/software/pymc3/pymc3/sampling.py in _iter_sample(draws, step, start, trace, chain, tune, model, random_seed)
393 point, states = step.step(point)
394 if strace.supports_sampler_stats:
--> 395 strace.record(point, states)
396 else:
397 strace.record(point)
~/github/software/pymc3/pymc3/backends/hdf5.py in record(self, point, sampler_stats)
176 data = self.stats[str(i)]
177 for key, val in vars.items():
--> 178 data[key][self.draw_idx] = val
179
180 self.draw_idx += 1
h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/_objects.c:2846)()
h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/_objects.c:2804)()
~/anaconda/envs/bayesian/lib/python3.6/site-packages/h5py/_hl/dataset.py in __setitem__(self, args, val)
628 mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
629 for fspace in selection.broadcast(mshape):
--> 630 self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
631
632 def read_direct(self, dest, source_sel=None, dest_sel=None):
h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/_objects.c:2846)()
h5py/_objects.pyx in h5py._objects.with_phil.wrapper (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/_objects.c:2804)()
h5py/h5d.pyx in h5py.h5d.DatasetID.write (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/h5d.c:3700)()
h5py/_proxy.pyx in h5py._proxy.dset_rw (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/_proxy.c:2028)()
h5py/_proxy.pyx in h5py._proxy.H5PY_H5Dwrite (/home/ilan/minonda/conda-bld/h5py_1496889914775/work/h5py/_proxy.c:1738)()
OSError: Can't prepare for writing data (No appropriate function for conversion path)
When switching to a text backend, ADVI init works, and NUTS sampling is very slow (expected, for a large model like this). There's only a single chain, and it was ~1.7GB in size. Memory usage kept climbing to 13GB, though much more slowly than with the previous problem. The kernel hung after a while as well. Not sure what's going on here.
Maybe this just causes an IO bottleneck. I wonder if dask would be faster.
Haven't seen the HDF5 error before. Perhaps others have.
The error when using hdf5 is due to a bug in the stats for NUTS. Should be fixed by #2467.
Should this issue and associated Theano issue be closed? https://github.com/Theano/Theano/issues/6206
Will wait for @ericmjl to confirm, but yes, I think this is more of a PyMC3 feature request than a bug.
@ColCarroll @nouiz yes, please go ahead and close the Theano issue. I will close this one here.