I found that once I have run PyMC3 on the GPU, regardless of how I configure Theano (through .theanorc, or using os.environ['THEANO_FLAGS'] = 'device=cpu,floatX=float32' to revert to CPU), as long as PyGPU is installed, as soon as I declare a PyMC3 model with tensor creation (e.g. W = pm.Normal("weights", mu=0, sd=1, shape=(n_types, M))), Theano will automatically switch to GPU mode with the following message:
Using cuDNN version 5110 on context None
Uninstalling PyGPU forces Theano to use CPU again.
Tested with master (33c8bff) version of PyMC3.
Env packages:
blas 1.1 openblas conda-forge
bleach 1.5.0 py36_0
certifi 2016.2.28 py36_0
cycler 0.10.0 py36_0
dbus 1.10.20 0
decorator 4.1.2 py36_0
entrypoints 0.2.3 py36_0
expat 2.1.0 0
fontconfig 2.12.1 3
freetype 2.5.5 2
glib 2.50.2 1
gst-plugins-base 1.8.0 0
gstreamer 1.8.0 0
h5py 2.7.1 <pip>
html5lib 0.9999999 py36_0
icu 54.1 0
ipykernel 4.6.1 py36_0
ipython 6.1.0 py36_0
ipython_genutils 0.2.0 py36_0
ipywidgets 6.0.0 py36_0
jbig 2.1 0
jedi 0.10.2 py36_2
jinja2 2.9.6 py36_0
joblib 0.11 <pip>
jpeg 9b 0
jsonschema 2.6.0 py36_0
jupyter 1.0.0 py36_3
jupyter_client 5.1.0 py36_0
jupyter_console 5.2.0 py36_0
jupyter_core 4.3.0 py36_0
libffi 3.2.1 1
libgcc 5.2.0 0
libgfortran 3.0.0 1
libgpuarray 0.6.9 0
libiconv 1.14 0
libpng 1.6.30 1
libsodium 1.0.10 0
libtiff 4.0.6 3
libxcb 1.12 1
libxml2 2.9.4 0
mako 1.0.6 py36_0
markupsafe 1.0 py36_0
matplotlib 2.0.2 np113py36_0
mistune 0.7.4 py36_0
mkl 2017.0.3 0
mpmath 0.19 py36_1
nbconvert 5.2.1 py36_0
nbformat 4.4.0 py36_0
nose 1.3.7 py36_1
notebook 5.0.0 py36_0
numpy 1.13.1 py36_blas_openblas_200 [blas_openblas] conda-forge
olefile 0.44 py36_0
openblas 0.2.19 0
openssl 1.0.2l 0
pandas 0.20.3 py36_1 conda-forge
pandocfilters 1.4.2 py36_0
path.py 10.3.1 py36_0
patsy 0.4.1 py36_0 conda-forge
patsy 0.4.1 <pip>
pcre 8.39 1
pexpect 4.2.1 py36_0
pickleshare 0.7.4 py36_0
pillow 4.2.1 py36_0
pip 9.0.1 py36_1
prompt_toolkit 1.0.15 py36_0
ptyprocess 0.5.2 py36_0
pygments 2.2.0 py36_0
pygpu 0.6.9 py36_0
pymc3 3.1 <pip>
pyparsing 2.2.0 py36_0
pyqt 5.6.0 py36_2
python 3.6.2 0
python-dateutil 2.6.1 py36_0
pytz 2017.2 py36_0
pyzmq 16.0.2 py36_0
qt 5.6.2 5
qtconsole 4.3.1 py36_0
readline 6.2 2
scikit-learn 0.19.0 py36_blas_openblas_201 [blas_openblas] conda-forge
scipy 0.19.1 py36_blas_openblas_202 [blas_openblas] conda-forge
seaborn 0.8.1 py36_0 conda-forge
setuptools 36.4.0 py36_0
simplegeneric 0.8.1 py36_1
sip 4.18 py36_0
six 1.10.0 py36_0
sqlite 3.13.0 0
statsmodels 0.8.0 np113py36_0 conda-forge
sympy 1.1.1 py36_0
terminado 0.6 py36_0
testpath 0.3.1 py36_0
Theano 0.9.0 <pip>
tk 8.5.18 0
tornado 4.5.2 py36_0
tqdm 4.15.0 <pip>
traitlets 4.3.2 py36_0
wcwidth 0.1.7 py36_0
wheel 0.29.0 py36_0
widgetsnbextension 3.0.2 py36_0
xz 5.2.3 0
zeromq 4.1.5 0
zlib 1.2.11 0
Also, if I set device=cuda in .theanorc, PyMC3 raises an assertion error when sampling (possibly a double device-initialisation conflict?). This error does not occur if I set device=cpu and let PyMC3 (or is it PyGPU?) default to the GPU.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-14-f6087a11b3d5> in <module>()
1 with model:
----> 2 trace = pm.sample(draws=5000, tune=2000)
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain, njobs, tune, nuts_kwargs, step_kwargs, progressbar, model, random_seed, live_plot, discard_tuned_samples, live_plot_kwargs, **kwargs)
251 start_, step = init_nuts(init=init, njobs=njobs, n_init=n_init,
252 model=model, random_seed=random_seed,
--> 253 progressbar=progressbar, **args)
254 if start is None:
255 start = start_
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/sampling.py in init_nuts(init, njobs, n_init, model, random_seed, progressbar, **kwargs)
878 raise NotImplementedError('Initializer {} is not supported.'.format(init))
879
--> 880 step = pm.NUTS(potential=potential, **kwargs)
881
882 return start, step
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/step_methods/hmc/nuts.py in __init__(self, vars, Emax, target_accept, gamma, k, t0, adapt_step_size, max_treedepth, on_error, early_max_treedepth, **kwargs)
153 `pm.sample` to the desired number of tuning steps.
154 """
--> 155 super(NUTS, self).__init__(vars, **kwargs)
156 self.Emax = Emax
157
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/step_methods/hmc/base_hmc.py in __init__(self, vars, scaling, step_scale, is_cov, model, blocked, potential, integrator, dtype, **theano_kwargs)
44
45 super(BaseHMC, self).__init__(vars, blocked=blocked, model=model,
---> 46 dtype=dtype, **theano_kwargs)
47
48 size = self._logp_dlogp_func.size
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/step_methods/arraystep.py in __init__(self, vars, model, blocked, dtype, **theano_kwargs)
166
167 self._logp_dlogp_func = model.logp_dlogp_function(
--> 168 vars, dtype=dtype, **theano_kwargs)
169
170 def step(self, point):
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/model.py in logp_dlogp_function(self, grad_vars, **kwargs)
656 varnames = [var.name for var in grad_vars]
657 extra_vars = [var for var in self.free_RVs if var.name not in varnames]
--> 658 return ValueGradFunction(self.logpt, grad_vars, extra_vars, **kwargs)
659
660 @property
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/pymc3/model.py in __init__(self, cost, grad_vars, extra_vars, dtype, casting, **kwargs)
403
404 self._theano_function = theano.function(
--> 405 inputs, [self._cost_joined, grad], givens=givens, **kwargs)
406
407 def set_extra_values(self, extra_vars):
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/compile/function.py in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
324 on_unused_input=on_unused_input,
325 profile=profile,
--> 326 output_keys=output_keys)
327 # We need to add the flag check_aliased inputs if we have any mutable or
328 # borrowed used defined inputs
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/compile/pfunc.py in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
484 accept_inplace=accept_inplace, name=name,
485 profile=profile, on_unused_input=on_unused_input,
--> 486 output_keys=output_keys)
487
488
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/compile/function_module.py in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
1792 profile=profile,
1793 on_unused_input=on_unused_input,
-> 1794 output_keys=output_keys).create(
1795 defaults)
1796
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/compile/function_module.py in __init__(self, inputs, outputs, mode, accept_inplace, function_builder, profile, on_unused_input, fgraph, output_keys)
1472 optimizer, inputs, outputs)
1473 else:
-> 1474 optimizer_profile = optimizer(fgraph)
1475
1476 end_optimizer = time.time()
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/opt.py in __call__(self, fgraph)
96
97 """
---> 98 return self.optimize(fgraph)
99
100 def add_requirements(self, fgraph):
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/opt.py in optimize(self, fgraph, *args, **kwargs)
85 orig = theano.tensor.basic.constant.enable
86 theano.tensor.basic.constant.enable = False
---> 87 ret = self.apply(fgraph, *args, **kwargs)
88 finally:
89 theano.tensor.basic.constant.enable = orig
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/opt.py in apply(self, fgraph)
233 nb_nodes_before = len(fgraph.apply_nodes)
234 t0 = time.time()
--> 235 sub_prof = optimizer.optimize(fgraph)
236 l.append(float(time.time() - t0))
237 sub_profs.append(sub_prof)
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/opt.py in optimize(self, fgraph, *args, **kwargs)
85 orig = theano.tensor.basic.constant.enable
86 theano.tensor.basic.constant.enable = False
---> 87 ret = self.apply(fgraph, *args, **kwargs)
88 finally:
89 theano.tensor.basic.constant.enable = orig
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/opt.py in apply(self, fgraph)
233 nb_nodes_before = len(fgraph.apply_nodes)
234 t0 = time.time()
--> 235 sub_prof = optimizer.optimize(fgraph)
236 l.append(float(time.time() - t0))
237 sub_profs.append(sub_prof)
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/opt.py in optimize(self, fgraph, *args, **kwargs)
85 orig = theano.tensor.basic.constant.enable
86 theano.tensor.basic.constant.enable = False
---> 87 ret = self.apply(fgraph, *args, **kwargs)
88 finally:
89 theano.tensor.basic.constant.enable = orig
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gpuarray/opt.py in apply(self, fgraph)
380 new_ops = lopt.transform(node.op, context_name,
381 [mapping[i] for i in node.inputs],
--> 382 node.outputs)
383 t_opt2 = time.time()
384 time_opts[lopt] += t_opt2 - t_opt
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gpuarray/opt.py in local_gpua_gemmbatch(op, context_name, inputs, outputs)
1187 a, b = inputs
1188 c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
-> 1189 return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
1190
1191
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gof/op.py in __call__(self, *inputs, **kwargs)
613 """
614 return_list = kwargs.pop('return_list', False)
--> 615 node = self.make_node(*inputs, **kwargs)
616
617 if config.compute_test_value != 'off':
~/anaconda3/envs/pymc3/lib/python3.6/site-packages/theano/gpuarray/blas.py in make_node(self, C, alpha, A, B, beta)
389 assert alpha.ndim == 0
390 assert beta.ndim == 0
--> 391 assert A.ndim == 3
392 assert B.ndim == 3
393 assert C.ndim == 3
AssertionError:
Upon further inspection, I think I can narrow down the possible issues a little bit:
The device=cuda error is likely caused by an unrelated issue - NUTS with njobs > 1 does not seem to work on GPU for me. I encountered GPUArrayException: cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s): CUDA_ERROR_INVALID_VALUE: invalid argument
I also have the GPUArrayException with njobs > 1.
do you have multiple GPUs?
no, just one.
So njobs will spawn multiple chains to run in parallel. If the model uses the GPU there will be a conflict. We recently added nchains where you can still run multiple chains. So I think running pm.sample(niter, nchains=4, njobs=1) should give you what you want.
Just tried setting nchains=4, nchains=10, and nchains=1 and no matter what it samples 2 chains with 1 job.
Here is my setup:
2018-01-30T14:46:29-07:00
CPython 3.6.4
IPython 6.2.1
compiler : GCC 7.2.0
system : Linux
release : 4.13.0-32-generic
machine : x86_64
processor : x86_64
CPU cores : 12
interpreter: 64bit
/media/bcr/HDD/anaconda3/envs/bayes_dash/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
from pandas.core import datetools
Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 980 Ti (0000:01:00.0)
pandas 0.22.0
pandas_datareader 0.5.0
numpy 1.13.3
pymc3 3.3
theano 1.0.1
sklearn 0.19.0
statsmodels 0.8.0
scipy 0.19.1
matplotlib 2.1.1
seaborn 0.8.1
cufflinks 0.12.1
plotly 2.2.2
pyarrow 0.7.1
Here is my code. In this example equity is simply a pandas series of portfolio values with a starting value of 100,000.
%%time
simp_rets = shared(equity.pct_change().dropna().values)
with pm.Model() as model:
mu = pm.Normal('mean rets', mu=0, sd=.01, testval=simp_rets.mean())
sigma = pm.HalfCauchy('vol', beta=1, testval=simp_rets.std())
returns = pm.Normal('returns', mu=mu, sd=sigma, observed=simp_rets)
pm.Deterministic(
'annual volatility',
returns.distribution.variance**.5 *
np.sqrt(252))
pm.Deterministic(
'sharpe',
returns.distribution.mean /
returns.distribution.variance**.5 *
np.sqrt(252))
step = pm.NUTS()
trace = pm.sample(1000, tune=1000, step=step, nchains=1, njobs=1)
Here is the output:
Sequential sampling (2 chains in 1 job)
NUTS: [vol_log__, mean rets]
100%|██████████| 2000/2000 [00:04<00:00, 487.45it/s]
100%|██████████| 2000/2000 [00:02<00:00, 756.20it/s]
CPU times: user 7.57 s, sys: 312 ms, total: 7.89 s
Wall time: 7.94 s
FYI, the keyword argument for the number of chains is chains=k, not nchains=k.
@junpenglao thanks for the clarification.
Does it work with that?
On Thu, Feb 1, 2018 at 6:06 PM, Brian notifications@github.com wrote:
@junpenglao https://github.com/junpenglao thanks for the clarification.
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
https://github.com/pymc-devs/pymc3/issues/2546#issuecomment-362334116,
or mute the thread
https://github.com/notifications/unsubscribe-auth/AApJmHps8kwFWy3AbPNFq8bsbD1sDo6xks5tQe8XgaJpZM4POycR
.
yes chains=k does work.
@twiecki It Really Really Works! Thanks so much!
Most helpful comment
So njobs will spawn multiple chains to run in parallel. If the model uses the GPU there will be a conflict. We recently added nchains where you can still run multiple chains. So I think running pm.sample(niter, nchains=4, njobs=1) should give you what you want.