Hello.
I am trying to get two GPUs to fit a Keras model. The model is copied from https://keras.io/utils/#multi_gpu_model. The Keras script is listed at the very bottom.
I am getting an OOM error. The error message is below. Both GPUs have been found.
I am using tf 1.5.0 and tf-gpu 1.6.0.
I am using keras 2.1.4.
I am running on Windows 7.
The following script:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
Returns:
['/job:localhost/replica:0/task:0/device:GPU:0',
'/job:localhost/replica:0/task:0/device:GPU:1']
Is there a way to avoid the OOM error?
Thanks,
Steve
Error Message:
Epoch 1/20
ResourceExhaustedError Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, args)
1360 try:
-> 1361 return fn(*args)
1362 except errors.OpError as e:
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1339 return tf_session.TF_Run(session, options, feed_dict, fetch_list,
-> 1340 target_list, status, run_metadata)
1341
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
515 compat.as_text(c_api.TF_Message(self.status.status)),
--> 516 c_api.TF_GetCode(self.status.status))
517 # Delete the underlying status object from memory otherwise it stays alive
ResourceExhaustedError: OOM when allocating tensor with shape[128,128,109,109] and type float on /job:localhost/replica:0/task:0/device:GPU:1 by allocator GPU_1_bfc
[[Node: replica_1/xception/block2_sepconv1_bn/FusedBatchNorm = FusedBatchNorm[T=DT_FLOAT, data_format="NHWC", epsilon=0.001, is_training=true, _device="/job:localhost/replica:0/task:0/device:GPU:1"]]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[Node: training/RMSprop/gradients/replica_0/xception/block7_sepconv2_bn/cond/batchnorm/mul_grad/Mul_1/_2981 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_16749...grad/Mul_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
During handling of the above exception, another exception occurred:
ResourceExhaustedError Traceback (most recent call last)
5 # This fit call will be distributed on 2 GPUs.
6 # Since the batch size is 256, each GPU will process 32 samples.
----> 7 parallel_model.fit(x, y, epochs=20, batch_size=256)
8
9 # Save model via the template model (which shares the same weights):
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1710 initial_epoch=initial_epoch,
1711 steps_per_epoch=steps_per_epoch,
-> 1712 validation_steps=validation_steps)
1713
1714 def evaluate(self, x=None, y=None,
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
1233 ins_batch[i] = ins_batch[i].toarray()
1234
-> 1235 outs = f(ins_batch)
1236 if not isinstance(outs, list):
1237 outs = [outs]
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py in __call__(self, inputs)
2473 session = get_session()
2474 updated = session.run(fetches=fetches, feed_dict=feed_dict,
-> 2475 **self.session_kwargs)
2476 return updated[:len(self.outputs)]
2477
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
903 try:
904 result = self._run(None, fetches, feed_dict, options_ptr,
--> 905 run_metadata_ptr)
906 if run_metadata:
907 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1135 if final_fetches or final_targets or (handle and feed_dict_tensor):
1136 results = self._do_run(handle, final_targets, final_fetches,
-> 1137 feed_dict_tensor, options, run_metadata)
1138 else:
1139 results = []
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1353 if handle is None:
1354 return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1355 options, run_metadata)
1356 else:
1357 return self._do_call(_prun_fn, self._session, handle, feeds, fetches)
C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1372 except KeyError:
1373 pass
-> 1374 raise type(e)(node_def, op, message)
1375
1376 def _extend_graph(self):
ResourceExhaustedError: OOM when allocating tensor with shape[128,128,109,109] and type float on /job:localhost/replica:0/task:0/device:GPU:1 by allocator GPU_1_bfc
[[Node: replica_1/xception/block2_sepconv1_bn/FusedBatchNorm = FusedBatchNorm[T=DT_FLOAT, data_format="NHWC", epsilon=0.001, is_training=true, _device="/job:localhost/replica:0/task:0/device:GPU:1"]]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[Node: training/RMSprop/gradients/replica_0/xception/block7_sepconv2_bn/cond/batchnorm/mul_grad/Mul_1/_2981 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_16749...grad/Mul_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Caused by op 'replica_1/xception/block2_sepconv1_bn/FusedBatchNorm', defined at:
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py", line 16, in
app.launch_new_instance()
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
self.io_loop.start()
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tornado\ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
if self.run_code(code, result):
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "
parallel_model = multi_gpu_model(model, gpus=2)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\utils\training_utils.py", line 175, in multi_gpu_model
outputs = model(inputs)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\topology.py", line 617, in __call__
output = self.call(inputs, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\topology.py", line 2081, in call
output_tensors, _, _ = self.run_internal_graph(inputs, masks)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\topology.py", line 2232, in run_internal_graph
output_tensors = _to_list(layer.call(computed_tensor, **kwargs))
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\layers\normalization.py", line 181, in call
epsilon=self.epsilon)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py", line 1824, in normalize_batch_in_training
epsilon=epsilon)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py", line 1799, in _fused_normalize_batch_in_training
data_format=tf_data_format)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\nn_impl.py", line 906, in fused_batch_norm
name=name)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 2569, in _fused_batch_norm
is_training=is_training, name=name)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 3271, in create_op
op_def=op_def)
File "C:\ProgramData\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 1650, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[128,128,109,109] and type float on /job:localhost/replica:0/task:0/device:GPU:1 by allocator GPU_1_bfc
[[Node: replica_1/xception/block2_sepconv1_bn/FusedBatchNorm = FusedBatchNorm[T=DT_FLOAT, data_format="NHWC", epsilon=0.001, is_training=true, _device="/job:localhost/replica:0/task:0/device:GPU:1"]]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[Node: training/RMSprop/gradients/replica_0/xception/block7_sepconv2_bn/cond/batchnorm/mul_grad/Mul_1/_2981 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_16749...grad/Mul_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
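(A side note on the hint in the traceback: with the TensorFlow backend it should be possible to forward a RunOptions proto through compile(), which Keras passes on to session.run(). This is a rough sketch, assuming Keras 2.1.x forwards extra compile keyword arguments to the session; I have not run it:)

import tensorflow as tf

# Ask TensorFlow to report which tensors are allocated when an OOM occurs.
run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

# Assumption: extra compile() kwargs are forwarded to session.run() by the backend.
parallel_model.compile(loss='categorical_crossentropy',
                       optimizer='rmsprop',
                       options=run_options)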
Keras Script:
import tensorflow as tf
from keras.applications import Xception
from keras.utils import multi_gpu_model
import numpy as np

num_samples = 1000
height = 224
width = 224
num_classes = 1000

# Instantiate the base model (or "template" model) under a CPU device scope,
# so that the model's weights are hosted on CPU memory.
with tf.device('/cpu:0'):
    model = Xception(weights=None,
                     input_shape=(height, width, 3),
                     classes=num_classes)

# Replicate the model on 8 GPUs.
parallel_model = multi_gpu_model(model, gpus=8)
parallel_model.compile(loss='categorical_crossentropy',
                       optimizer='rmsprop')

# Generate dummy data.
x = np.random.random((num_samples, height, width, 3))
y = np.random.random((num_samples, num_classes))

# This fit call will be distributed on 8 GPUs.
# Since the batch size is 256, each GPU will process 32 samples.
parallel_model.fit(x, y, epochs=20, batch_size=256)

# Save model via the template model (which shares the same weights):
model.save('my_model.h5')
In multi_gpu_model(model, gpus=8) you specify 8 GPUs instead of 2.
Note: that was a copy-and-paste error. The script I actually executed was:
parallel_model = multi_gpu_model(model, gpus=2)
parallel_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Not gpus=8.
Sorry about the confusion.
Then try to reduce the batch_size; you are asking each GPU to process 256 // 2 = 128 samples per step, which is pretty large for 224x224 images.
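For example (an illustrative value, reusing x, y, and parallel_model from the script above; tune it to your GPUs' memory):

# A smaller batch means smaller activation tensors per replica,
# e.g. batch_size=64 gives each of the 2 GPUs 32 samples per step.
parallel_model.fit(x, y, epochs=20, batch_size=64)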
That worked. Set batch size to 32. Thanks!
For people stuck with this in models other than MNIST:
the reason is usually the large number of parameters (please check your model.summary()).
A good way to drastically lower the parameter count is to add
subsample=(2, 2) (careful, it lowers the resolution of your images/feature maps) to all the convolutional layers above the Flatten layer;
if subsample is not accepted by your Keras version, the argument is strides=(2, 2).
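For example, with the Keras 2 argument name (a hypothetical toy model, just to show where strides goes and how it shrinks the parameter count):

from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense

model = Sequential()
# strides=(2, 2) halves the spatial resolution at each conv layer,
# which shrinks the activations that reach Flatten/Dense.
model.add(Conv2D(32, (3, 3), strides=(2, 2), activation='relu',
                 input_shape=(224, 224, 3)))
model.add(Conv2D(64, (3, 3), strides=(2, 2), activation='relu'))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.summary()  # check that the parameter count has dropped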