I got this recently trying to tune the hyperparameters on an MLP.
Relevant versions:
python==3.7.1
ax-platform==0.1.2
botorch==0.1.0
gpytorch==0.3.2
scipy==1.1.0
torch==1.1.0
I'm using ax.optimize() as the entrypoint. It was 45 trials into the experiment. Here's the stack trace.
ax.service.managed_loop: Running optimization trial 45...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-37-5c288f8ea2dd> in <module>
76 ],
77 evaluation_function=do_train,
---> 78 minimize=True,
79 )
~/anaconda3/lib/python3.7/site-packages/ax/service/managed_loop.py in optimize(parameters, evaluation_function, experiment_name, objective_name, minimize, parameter_constraints, outcome_constraints, total_trials, arms_per_trial, wait_time)
204 wait_time=wait_time,
205 )
--> 206 loop.full_run()
207 parameterization, values = loop.get_best_point()
208 return parameterization, values, loop.experiment, loop.get_current_model()
~/anaconda3/lib/python3.7/site-packages/ax/service/managed_loop.py in full_run(self)
148 logger.info(f"Started full optimization with {num_steps} steps.")
149 for _ in range(num_steps):
--> 150 self.run_trial()
151 return self
152
~/anaconda3/lib/python3.7/site-packages/ax/service/managed_loop.py in run_trial(self)
128 trial = self.experiment.new_trial(
129 generator_run=self.generation_strategy.gen(
--> 130 experiment=self.experiment, new_data=dat
131 )
132 )
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/generation_strategy.py in gen(self, experiment, new_data, n, **kwargs)
161 elif new_data is not None:
162 # We're sticking with the current model, but update with new data
--> 163 self._model.update(experiment=experiment, data=new_data)
164
165 gen_run = not_none(self._model).gen(n=n, **(self._curr.model_gen_kwargs or {}))
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/base.py in update(self, data, experiment)
385 obs_feats = t.transform_observation_features(obs_feats)
386 obs_data = t.transform_observation_data(obs_data, obs_feats)
--> 387 self._update(observation_features=obs_feats, observation_data=obs_data)
388 self.fit_time += time.time() - t_update_start
389 self.fit_time_since_gen += time.time() - t_update_start
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/array.py in _update(self, observation_features, observation_data)
110 # Update in-design status for these new points.
111 self.training_in_design[-len(observation_features) :] = in_design
--> 112 self._model_update(Xs=Xs_array, Ys=Ys_array, Yvars=Yvars_array)
113
114 def _model_update(
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/torch.py in _model_update(self, Xs, Ys, Yvars)
113 Ys: List[Tensor] = self._array_list_to_tensors(Ys)
114 Yvars: List[Tensor] = self._array_list_to_tensors(Yvars)
--> 115 self.model.update(Xs=Xs, Ys=Ys, Yvars=Yvars)
116
117 def _model_predict(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
~/anaconda3/lib/python3.7/site-packages/ax/models/torch/botorch.py in update(self, Xs, Ys, Yvars)
372 Yvars=self.Yvars,
373 task_features=self.task_features,
--> 374 state_dict=state_dict,
375 )
~/anaconda3/lib/python3.7/site-packages/ax/models/torch/botorch_defaults.py in get_and_fit_model(Xs, Ys, Yvars, task_features, state_dict, **kwargs)
84 # pyre-ignore: [16]
85 mll = ExactMarginalLogLikelihood(model.likelihood, model)
---> 86 mll = fit_gpytorch_model(mll, bounds=bounds)
87 else:
88 model.load_state_dict(state_dict)
~/anaconda3/lib/python3.7/site-packages/botorch/fit.py in fit_gpytorch_model(mll, optimizer, **kwargs)
33 """
34 mll.train()
---> 35 mll, _ = optimizer(mll, track_iterations=False, **kwargs)
36 mll.eval()
37 return mll
~/anaconda3/lib/python3.7/site-packages/botorch/optim/fit.py in fit_gpytorch_scipy(mll, bounds, method, options, track_iterations)
186 jac=True,
187 options=options,
--> 188 callback=cb,
189 )
190 iterations = []
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
601 elif meth == 'l-bfgs-b':
602 return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 603 callback=callback, **options)
604 elif meth == 'tnc':
605 return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
333 # until the completion of the current minimization iteration.
334 # Overwrite f and g:
--> 335 f, g = func_and_grad(x)
336 elif task_str.startswith(b'NEW_X'):
337 # new iteration
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
283 else:
284 def func_and_grad(x):
--> 285 f = fun(x, *args)
286 g = jac(x, *args)
287 return f, g
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
291 def function_wrapper(*wrapper_args):
292 ncalls[0] += 1
--> 293 return function(*(wrapper_args + args))
294
295 return ncalls, function_wrapper
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/optimize.py in __call__(self, x, *args)
61 def __call__(self, x, *args):
62 self.x = numpy.asarray(x).copy()
---> 63 fg = self.fun(x, *args)
64 self.jac = fg[1]
65 return fg[0]
~/anaconda3/lib/python3.7/site-packages/botorch/optim/fit.py in _scipy_objective_and_grad(x, mll, property_dict)
221 output = mll.model(*train_inputs)
222 args = [output, train_targets] + _get_extra_mll_args(mll)
--> 223 loss = -mll(*args).sum()
224 loss.backward()
225 param_dict = OrderedDict(mll.named_parameters())
~/anaconda3/lib/python3.7/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
20
21 def __call__(self, *inputs, **kwargs):
---> 22 outputs = self.forward(*inputs, **kwargs)
23 if isinstance(outputs, list):
24 return [_validate_module_outputs(output) for output in outputs]
~/anaconda3/lib/python3.7/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py in forward(self, output, target, *params)
26 # Get the log prob of the marginal distribution
27 output = self.likelihood(output, *params)
---> 28 res = output.log_prob(target)
29
30 # Add terms for SGPR / when inducing points are learned
~/anaconda3/lib/python3.7/site-packages/gpytorch/distributions/multivariate_normal.py in log_prob(self, value)
127
128 # Get log determininat and first part of quadratic form
--> 129 inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
130
131 res = -0.5 * sum([inv_quad, logdet, diff.size(-1) * math.log(2 * math.pi)])
~/anaconda3/lib/python3.7/site-packages/gpytorch/lazy/lazy_tensor.py in inv_quad_logdet(self, inv_quad_rhs, logdet, reduce_inv_quad)
990 from .chol_lazy_tensor import CholLazyTensor
991
--> 992 cholesky = CholLazyTensor(self.cholesky())
993 return cholesky.inv_quad_logdet(inv_quad_rhs=inv_quad_rhs, logdet=logdet, reduce_inv_quad=reduce_inv_quad)
994
~/anaconda3/lib/python3.7/site-packages/gpytorch/lazy/lazy_tensor.py in cholesky(self, upper)
716 (LazyTensor) Cholesky factor (lower triangular)
717 """
--> 718 res = self._cholesky()
719 if upper:
720 res = res.transpose(-1, -2)
~/anaconda3/lib/python3.7/site-packages/gpytorch/utils/memoize.py in g(self, *args, **kwargs)
32 cache_name = name if name is not None else method
33 if not is_in_cache(self, cache_name):
---> 34 add_to_cache(self, cache_name, method(self, *args, **kwargs))
35 return get_from_cache(self, cache_name)
36
~/anaconda3/lib/python3.7/site-packages/gpytorch/lazy/lazy_tensor.py in _cholesky(self)
401 evaluated_mat.register_hook(_ensure_symmetric_grad)
402
--> 403 cholesky = psd_safe_cholesky(evaluated_mat.double()).to(self.dtype)
404 return NonLazyTensor(cholesky)
405
~/anaconda3/lib/python3.7/site-packages/gpytorch/utils/cholesky.py in psd_safe_cholesky(A, upper, out, jitter)
45 continue
46
---> 47 raise e
48
49
~/anaconda3/lib/python3.7/site-packages/gpytorch/utils/cholesky.py in psd_safe_cholesky(A, upper, out, jitter)
19 """
20 try:
---> 21 L = torch.cholesky(A, upper=upper, out=out)
22 # TODO: Remove once fixed in pytorch (#16780)
23 if A.dim() > 2 and A.is_cuda:
RuntimeError: cholesky_cpu: U(1,1) is zero, singular U.
Do you have code you could share that would help us to reproduce this issue?
This appears to be a numerical issue during model fitting. The error message is somewhat misleading, as it seems to me that this is caused by NaNs appearing in the covariance (if you look here, adding jitter to the matrix repeatedly is unsuccessful, which suggests there are NaNs in the matrix).
Assuming this is not caused by NaNs in the input data (since it ran fine for 45 steps), this sounds like a numerical issue during computation of the kernel matrix, possibly caused by extreme hyperparameter values during optimization.
As @kkashin said, having a fully repro-able code example would make it easier for us to debug. If you can't share one, would it at least be possible to share the state_dict of the model that is being fit when the error occurs so we can narrow things down? To do that, you can call mll.state_dict() on the mll in the part of your stack trace I reference below. That will give you an OrderedDict with the model parameters (should be small enough to just paste here).
~/anaconda3/lib/python3.7/site-packages/ax/models/torch/botorch_defaults.py in get_and_fit_model(Xs, Ys, Yvars, task_features, state_dict, **kwargs)
84 # pyre-ignore: [16]
85 mll = ExactMarginalLogLikelihood(model.likelihood, model)
---> 86 mll = fit_gpytorch_model(mll, bounds=bounds)
87 else:
88 model.load_state_dict(state_dict)
Hi Max! The model code isn't going to do much good without the dataset, which isn't straightforward to share. But I do have the experiment log, which I think should be better for reproducing the failure anyway, although it'll take a bit of mucking around to short-circuit the candidate generation. Hopefully there's enough precision in the logged objective values.
You'll see there are no NaNs in the objective or hyperparameter values. But once (trial 31) the target model started to diverge during training, and I clip the objective value at 10 and report that to Ax. Is that too extreme and leading to the instability? (Looks like the cholesky is already in double precision, so I'd have guessed this range wasn't too big for it, but what do I know.)
best_parameters, best_values, experiment, model = ax.optimize(
total_trials=500,
parameters=[
{
"name": "lr",
"type": "range",
"bounds": [1e-6, 1e-3],
"log_scale": True,
},
{
"name": "wd",
"type": "range",
"bounds": [0.1, 1.0],
"log_scale": True,
},
{
"name": "layers",
"type": "range",
"bounds": [2, 3],
"value_type": "int",
},
{
"name": "alpha",
"type": "range",
"bounds": [0.1, 0.6],
},
{
"name": "N",
"type": "range",
"bounds": [100,200],
"value_type": "int",
"log_scale": True,
},
{
"name": "dim",
"type": "range",
"bounds": [2000, 5000],
"value_type": "int",
"log_scale": True,
},
{
"name": "gc",
"type": "range",
"bounds": [1, 100],
"log_scale": True,
},
{
"name": "gamma",
"type": "range",
"bounds": [0.1, 0.5],
},
{
"name": "ede",
"type": "range",
"bounds": [10, 100],
"value_type": "int",
},
],
evaluation_function=train,
minimize=True,
)
[INFO 02-08 11:50:15] ax.service.utils.dispatch: Using Bayesian Optimization generation strategy. Iterations after 11 will take longer to generate due to model-fitting.
[INFO 02-08 11:50:15] ax.service.managed_loop: Started full optimization with 500 steps.
[INFO 02-08 11:50:15] ax.service.managed_loop: Running optimization trial 1...
Value=0.004375 with config {'lr': 0.00016812216675290187, 'wd': 0.24963738873365873, 'layers': 3, 'alpha': 0.10984280183911324, 'N': 143, 'dim': 3707, 'gc': 59, 'gamma': 0.21202096939086915, 'ede': 40}
[INFO 02-08 11:55:31] ax.service.managed_loop: Running optimization trial 2...
Value=0.025925 with config {'lr': 0.0004690182965445481, 'wd': 0.7725450900816042, 'layers': 3, 'alpha': 0.37012011408805845, 'N': 179, 'dim': 3130, 'gc': 21, 'gamma': 0.36687912940979006, 'ede': 98}
[INFO 02-08 12:01:04] ax.service.managed_loop: Running optimization trial 3...
Value=0.014933 with config {'lr': 4.911478382596337e-06, 'wd': 0.1228580616490529, 'layers': 2, 'alpha': 0.2888772994279861, 'N': 116, 'dim': 4054, 'gc': 3, 'gamma': 0.1927550256252289, 'ede': 21}
[INFO 02-08 12:03:24] ax.service.managed_loop: Running optimization trial 4...
Value=0.021171 with config {'lr': 2.037301994352652e-06, 'wd': 0.7267731523624913, 'layers': 3, 'alpha': 0.507215803861618, 'N': 190, 'dim': 4645, 'gc': 3, 'gamma': 0.28858082294464116, 'ede': 63}
[INFO 02-08 12:14:41] ax.service.managed_loop: Running optimization trial 5...
Value=0.004488 with config {'lr': 0.0001945712694809502, 'wd': 0.14480669958895792, 'layers': 2, 'alpha': 0.21431311815977097, 'N': 103, 'dim': 2693, 'gc': 56, 'gamma': 0.4642613649368287, 'ede': 50}
[INFO 02-08 12:16:08] ax.service.managed_loop: Running optimization trial 6...
Value=0.011162 with config {'lr': 6.973778184712988e-05, 'wd': 0.4861745875646687, 'layers': 2, 'alpha': 0.454980856180191, 'N': 127, 'dim': 3189, 'gc': 15, 'gamma': 0.10681413114070892, 'ede': 89}
[INFO 02-08 12:18:13] ax.service.managed_loop: Running optimization trial 7...
Value=0.009781 with config {'lr': 6.310663950741618e-06, 'wd': 0.22566238789019408, 'layers': 3, 'alpha': 0.2859866380691528, 'N': 165, 'dim': 2462, 'gc': 1, 'gamma': 0.3340693712234497, 'ede': 31}
[INFO 02-08 12:22:07] ax.service.managed_loop: Running optimization trial 8...
Value=0.009211 with config {'lr': 9.240889965939464e-06, 'wd': 0.9187075040656306, 'layers': 2, 'alpha': 0.16848297268152237, 'N': 149, 'dim': 2272, 'gc': 7, 'gamma': 0.13546110689640045, 'ede': 59}
[INFO 02-08 12:23:56] ax.service.managed_loop: Running optimization trial 9...
Value=0.006501 with config {'lr': 4.299508367078726e-05, 'wd': 0.10296312057567615, 'layers': 3, 'alpha': 0.4943604230880737, 'N': 137, 'dim': 3494, 'gc': 86, 'gamma': 0.3111355543136597, 'ede': 49}
[INFO 02-08 12:28:59] ax.service.managed_loop: Running optimization trial 10...
Value=0.012924 with config {'lr': 0.00031738759896088434, 'wd': 0.33310587510050016, 'layers': 3, 'alpha': 0.23805130124092103, 'N': 110, 'dim': 2612, 'gc': 30, 'gamma': 0.26618983745574953, 'ede': 79}
[INFO 02-08 12:31:42] ax.service.managed_loop: Running optimization trial 11...
Value=0.120254 with config {'lr': 1.3992052486671128e-06, 'wd': 0.27486517977298136, 'layers': 2, 'alpha': 0.43660125732421873, 'N': 169, 'dim': 4770, 'gc': 2, 'gamma': 0.49343898296356203, 'ede': 24}
[INFO 02-08 12:35:51] ax.service.managed_loop: Running optimization trial 12...
Value=0.011218 with config {'lr': 9.290986021046619e-05, 'wd': 0.17477746945297995, 'layers': 3, 'alpha': 0.2855728537035479, 'N': 135, 'dim': 3394, 'gc': 56, 'gamma': 0.2617219644488902, 'ede': 48}
[INFO 02-08 12:40:51] ax.service.managed_loop: Running optimization trial 13...
Value=0.004440 with config {'lr': 0.00016757796508636206, 'wd': 0.1713472205808336, 'layers': 2, 'alpha': 0.23367625583650356, 'N': 110, 'dim': 2817, 'gc': 51, 'gamma': 0.36678336754007435, 'ede': 54}
[INFO 02-08 12:42:29] ax.service.managed_loop: Running optimization trial 14...
Value=0.009289 with config {'lr': 0.000173630632090083, 'wd': 0.16237863097334462, 'layers': 2, 'alpha': 0.2265017636764704, 'N': 109, 'dim': 2813, 'gc': 53, 'gamma': 0.3933884598402293, 'ede': 52}
[INFO 02-08 12:44:05] ax.service.managed_loop: Running optimization trial 15...
Value=0.007604 with config {'lr': 0.00013911645240162097, 'wd': 0.2285218231412646, 'layers': 2, 'alpha': 0.27325710798757974, 'N': 109, 'dim': 2676, 'gc': 38, 'gamma': 0.2646792574082898, 'ede': 67}
[INFO 02-08 12:45:38] ax.service.managed_loop: Running optimization trial 16...
Value=0.011241 with config {'lr': 0.0002394903493581587, 'wd': 0.1537487953601106, 'layers': 2, 'alpha': 0.23944226634640792, 'N': 100, 'dim': 2180, 'gc': 48, 'gamma': 0.4948677564362487, 'ede': 65}
[INFO 02-08 12:46:52] ax.service.managed_loop: Running optimization trial 17...
Value=0.007165 with config {'lr': 0.00016044935066810036, 'wd': 0.1784394550193929, 'layers': 2, 'alpha': 0.23795998699953563, 'N': 119, 'dim': 3129, 'gc': 56, 'gamma': 0.26135988071644695, 'ede': 51}
[INFO 02-08 12:48:49] ax.service.managed_loop: Running optimization trial 18...
Value=0.021068 with config {'lr': 0.0001686913885547563, 'wd': 0.18122427747814504, 'layers': 2, 'alpha': 0.25182998358340325, 'N': 100, 'dim': 2845, 'gc': 52, 'gamma': 0.32023837609743855, 'ede': 57}
[INFO 02-08 12:50:18] ax.service.managed_loop: Running optimization trial 19...
Value=0.012727 with config {'lr': 0.00014020619070111062, 'wd': 0.21021866483658874, 'layers': 2, 'alpha': 0.2456511506690128, 'N': 122, 'dim': 2625, 'gc': 38, 'gamma': 0.34240937299584073, 'ede': 63}
[INFO 02-08 12:52:02] ax.service.managed_loop: Running optimization trial 20...
Value=0.002248 with config {'lr': 0.00019158505926825608, 'wd': 0.3191209072814306, 'layers': 3, 'alpha': 0.10914458310743344, 'N': 130, 'dim': 3291, 'gc': 39, 'gamma': 0.19877066657801515, 'ede': 50}
[INFO 02-08 12:56:33] ax.service.managed_loop: Running optimization trial 21...
Value=0.003993 with config {'lr': 0.00017220270187833796, 'wd': 0.29505540130859487, 'layers': 3, 'alpha': 0.12210848096493117, 'N': 131, 'dim': 3367, 'gc': 43, 'gamma': 0.19915110420326226, 'ede': 48}
[INFO 02-08 13:01:19] ax.service.managed_loop: Running optimization trial 22...
Value=0.004662 with config {'lr': 0.00028300183676618256, 'wd': 0.4057448401728478, 'layers': 3, 'alpha': 0.1, 'N': 143, 'dim': 2994, 'gc': 18, 'gamma': 0.2310557362053592, 'ede': 49}
[INFO 02-08 13:05:45] ax.service.managed_loop: Running optimization trial 23...
Value=0.005610 with config {'lr': 0.0005998066738977613, 'wd': 0.43664878160217324, 'layers': 3, 'alpha': 0.1, 'N': 135, 'dim': 3359, 'gc': 58, 'gamma': 0.21639082414406885, 'ede': 60}
[INFO 02-08 13:10:44] ax.service.managed_loop: Running optimization trial 24...
Value=0.002151 with config {'lr': 0.00016349024039665071, 'wd': 0.4093790830808904, 'layers': 3, 'alpha': 0.10000000000000002, 'N': 130, 'dim': 3047, 'gc': 24, 'gamma': 0.19198278127393764, 'ede': 56}
[INFO 02-08 13:14:48] ax.service.managed_loop: Running optimization trial 25...
Value=0.002478 with config {'lr': 0.0003669777895469256, 'wd': 0.3395797711237239, 'layers': 3, 'alpha': 0.10000000000000003, 'N': 128, 'dim': 3058, 'gc': 24, 'gamma': 0.21886699252241493, 'ede': 46}
[INFO 02-08 13:18:51] ax.service.managed_loop: Running optimization trial 26...
Value=0.002177 with config {'lr': 0.00019359264829855308, 'wd': 0.3331508022458828, 'layers': 3, 'alpha': 0.10000000000000003, 'N': 132, 'dim': 3082, 'gc': 31, 'gamma': 0.21065199227788203, 'ede': 53}
[INFO 02-08 13:23:12] ax.service.managed_loop: Running optimization trial 27...
Value=0.001935 with config {'lr': 0.0002249607423079569, 'wd': 0.39715522093452976, 'layers': 3, 'alpha': 0.1, 'N': 131, 'dim': 3374, 'gc': 26, 'gamma': 0.1950106131983554, 'ede': 47}
[INFO 02-08 13:28:05] ax.service.managed_loop: Running optimization trial 28...
Value=0.002201 with config {'lr': 0.00025348225203233676, 'wd': 0.36154320304757553, 'layers': 3, 'alpha': 0.1, 'N': 128, 'dim': 3077, 'gc': 33, 'gamma': 0.1842880559185543, 'ede': 51}
[INFO 02-08 13:32:16] ax.service.managed_loop: Running optimization trial 29...
Value=0.002305 with config {'lr': 0.00015100386977521494, 'wd': 0.37957857879026136, 'layers': 3, 'alpha': 0.10000000000000007, 'N': 125, 'dim': 3452, 'gc': 27, 'gamma': 0.22294412798357982, 'ede': 57}
[INFO 02-08 13:36:56] ax.service.managed_loop: Running optimization trial 30...
Value=0.002271 with config {'lr': 0.00017033384806033132, 'wd': 0.368011967003343, 'layers': 3, 'alpha': 0.1, 'N': 130, 'dim': 3271, 'gc': 28, 'gamma': 0.21109117014112871, 'ede': 51}
[INFO 02-08 13:41:30] ax.service.managed_loop: Running optimization trial 31...
Diverging!
Value=10.000000 with config {'lr': 0.0004586489386597981, 'wd': 0.22697027254125698, 'layers': 3, 'alpha': 0.1, 'N': 132, 'dim': 3766, 'gc': 12, 'gamma': 0.18220847777017718, 'ede': 69}
[INFO 02-08 13:41:32] ax.service.managed_loop: Running optimization trial 32...
Value=0.006483 with config {'lr': 0.0005530533855718001, 'wd': 0.38879644863833657, 'layers': 3, 'alpha': 0.13332081406255641, 'N': 127, 'dim': 2783, 'gc': 25, 'gamma': 0.3094881322584204, 'ede': 47}
[INFO 02-08 13:45:06] ax.service.managed_loop: Running optimization trial 33...
Value=0.018802 with config {'lr': 0.0005284416542020141, 'wd': 0.34039303161192414, 'layers': 3, 'alpha': 0.1899904330273221, 'N': 119, 'dim': 3009, 'gc': 45, 'gamma': 0.27469404047122525, 'ede': 69}
[INFO 02-08 13:48:49] ax.service.managed_loop: Running optimization trial 34...
Value=0.004636 with config {'lr': 0.00013619116036606052, 'wd': 0.22997258245835236, 'layers': 3, 'alpha': 0.23773381574636718, 'N': 127, 'dim': 3081, 'gc': 57, 'gamma': 0.26157413613817126, 'ede': 50}
[INFO 02-08 13:53:05] ax.service.managed_loop: Running optimization trial 35...
Value=0.003142 with config {'lr': 0.0003448995183295857, 'wd': 0.34022601388333334, 'layers': 3, 'alpha': 0.17948034619966718, 'N': 105, 'dim': 2869, 'gc': 32, 'gamma': 0.2365326544543242, 'ede': 51}
[INFO 02-08 13:56:12] ax.service.managed_loop: Running optimization trial 36...
Value=0.006068 with config {'lr': 0.00014723365944109825, 'wd': 0.20523271741741247, 'layers': 2, 'alpha': 0.25104982024634753, 'N': 122, 'dim': 2802, 'gc': 52, 'gamma': 0.2918347632375946, 'ede': 57}
[INFO 02-08 13:58:05] ax.service.managed_loop: Running optimization trial 37...
Value=0.001938 with config {'lr': 0.0003680283011842956, 'wd': 0.4859054797974123, 'layers': 3, 'alpha': 0.1270701573606817, 'N': 153, 'dim': 2923, 'gc': 19, 'gamma': 0.25227222945965627, 'ede': 46}
[INFO 02-08 14:02:40] ax.service.managed_loop: Running optimization trial 38...
Value=0.006537 with config {'lr': 0.00014199957460250102, 'wd': 0.19536846022823023, 'layers': 2, 'alpha': 0.24464645066868562, 'N': 122, 'dim': 2993, 'gc': 54, 'gamma': 0.2864764767996054, 'ede': 53}
[INFO 02-08 14:04:37] ax.service.managed_loop: Running optimization trial 39...
Value=0.003406 with config {'lr': 0.00012312712315662428, 'wd': 0.3626589693533911, 'layers': 3, 'alpha': 0.10002004968904474, 'N': 126, 'dim': 2876, 'gc': 20, 'gamma': 0.2333923990668454, 'ede': 53}
[INFO 02-08 14:08:20] ax.service.managed_loop: Running optimization trial 40...
Value=0.002471 with config {'lr': 0.00012707189462705774, 'wd': 0.4040087503156827, 'layers': 3, 'alpha': 0.100001301280061, 'N': 128, 'dim': 2773, 'gc': 20, 'gamma': 0.26491356409869293, 'ede': 36}
[INFO 02-08 14:11:54] ax.service.managed_loop: Running optimization trial 41...
Value=0.002709 with config {'lr': 0.00010387476249879035, 'wd': 0.46491324487855373, 'layers': 3, 'alpha': 0.11674231008338423, 'N': 121, 'dim': 3386, 'gc': 38, 'gamma': 0.25069428297551644, 'ede': 55}
[INFO 02-08 14:16:28] ax.service.managed_loop: Running optimization trial 42...
Value=0.004066 with config {'lr': 0.00023946640214646476, 'wd': 0.3013734922727032, 'layers': 3, 'alpha': 0.16877517752400994, 'N': 140, 'dim': 3724, 'gc': 52, 'gamma': 0.2499769143010905, 'ede': 57}
[INFO 02-08 14:22:24] ax.service.managed_loop: Running optimization trial 43...
Value=0.018601 with config {'lr': 0.00047844043256099804, 'wd': 0.45394435260736493, 'layers': 3, 'alpha': 0.31648548752116085, 'N': 152, 'dim': 3330, 'gc': 25, 'gamma': 0.264615965113727, 'ede': 83}
[INFO 02-08 14:27:59] ax.service.managed_loop: Running optimization trial 44...
Value=0.003107 with config {'lr': 9.427714022378116e-05, 'wd': 0.44785228274055144, 'layers': 3, 'alpha': 0.10005444436588568, 'N': 127, 'dim': 2779, 'gc': 20, 'gamma': 0.2401902979582926, 'ede': 47}
[INFO 02-08 14:31:39] ax.service.managed_loop: Running optimization trial 45...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
...
Aha, the plot thickens. Ax by default standardizes the outcome values internally (zero mean, unit variance) so that the default hyperparameter priors work for different scales. Clamping the objective value to 10 (which is about two orders of magnitude larger than the largest other observation) could well lead to this issue during model fitting, in particular since by default we warm-start the model parameters from the ones of the fitted model in the previous iteration. Optimized hyperparameters for a model evaluated on data with vastly different objective values could end up causing numerical issues in the solves that result in NaNs.
Let me look into this in more detail tomorrow using the parameter and objective values from the log. In the meantime, as a sanity check you could try choosing a much smaller clamp value for now (maybe something around 0.5-1 or so).
Ultimately, failures like the model diverging should be handled in a special fashion (e.g. marking the trial as "failed" and possibly automatically backing off from the bounds), but that will require some additional work. In the diverging trial, do you notice anything particular w.r.t the parameters (close to the boundary of the design space)?
@leopd, I took a closer look at this and wasn't able to reproduce the fitting issues (though I can't rule out that I messed up in trying to re-create the experiment state from the logs, I'll ask some other folks to double-check this).
Is this issue reproducible on your end?
Here is the notebook that I used, let me know if this looks sane to you: debug_leos_fitting_issue.ipynb.txt
Aside:
Above I stated that
Optimized hyperparameters for a model evaluated on data with vastly different objective values could end up causing numerical issues in the solves that result in NaNs.
This argument doesn't convince me anymore, since the clamping happens much earlier (iteration 31) than the failure in the model fitting. If this were the cause, one would expect this to happen once the model gets refit on the outlier value.
@Balandat it hasn't happened again since I lowered the clamping value down to 1. Also, I'm not using that blackbox function any more as there was a bug in the data pre-processing stage. That was causing the good objective values to be all very close together, which is consistent with the theory that the scaling of the objectives was the problem. Although the timing of the error is odd (iteration 31 vs 45). Since the jittering is stochastic, maybe it's just bad luck -- we should configure the jitter warnings to show up in the logs, because if the jitter warnings start showing up right after 31 and just happen to go past 3 tries on 45, that's a strong signal.
we should configure the jitter warnings to show up in the logs
That's a good suggestion, let me see what the best way of doing this would be. One simple thing we could do to ensure that default output doesn't get cluttered is just log it on the info or debug level.
I think the necessary logging code is already in place https://github.com/cornellius-gp/gpytorch/blob/master/gpytorch/utils/cholesky.py#L42 but I'm honestly not familiar enough with the warnings package to know how to configure it to show up.
If this warning doesn't show for you, it must be filtered by the filters defined in warnings.filters. To override this for this particular message you can do
warnings.filterwarnings(
"always", message="^A not p.d., added jitter", category=RuntimeWarning
)
which makes sure that all RuntimeWarning messages that start with "A not p.d., added jitter" are always shown.
Hi @leopd , let us know if you are still having issues or how we can help. I also wonder if this is something @jacobrgardner can help with, but it's kind of tough without having a reproducible example. Is it possible for you to save out the data for your design points and observed Ys so that we can plug it into gpytorch without all of your other code? I am also curious how many design points produce "outlier" values.
@bletham, @kkashin , @Balandat, I wonder if it would be worth creating some kind of utility function to catch exceptions in ax and then dump the observed data in some format to make it easier to repro (particularly wrt modeling errors like this one).
This is being worked on on the BoTorch end: https://github.com/pytorch/botorch/issues/179
While it's not Ax-specific, I'll leave the issue open for now until this is solved upstream.
We have released BoTorch 0.1.1, which should fix most of these numerical issues.
Is there a way to have Ax just halt and report the best results up to that point?
@igorrivin We're looking at more graceful halting when we encounter this, but if you're using the developer API you'll be able to access all completed results with no problem.
Thanks @2timesjay ! @igorrivin the Service API should work for you as well. It's only the Loop API in which you won't easily be able to see best results in the case of a crash (but we'll fix this).
Most helpful comment
@Balandat it hasn't happened again since I lowered the clamping value down to 1. Also, I'm not using that blackbox function any more as there was a bug in the data pre-processing stage. That was causing the good objective values to be all very close together, which is consistent with the theory that the scaling of the objectives was the problem. Although the timing of the error is odd (iteration 31 vs 45). Since the jittering is stochastic, maybe it's just bad luck -- we should configure the jitter warnings to show up in the logs, because if the jitter warnings start showing up right after 31 and just happen to go past 3 tries on 45, that's a strong signal.