Environment: Keras 2.2.4, tensorflow-gpu 1.10.1.
Here is my network structure:
from keras.models import Model
from keras.layers import Conv2D, BatchNormalization, Activation, Lambda, add, Input, concatenate
from keras import backend as K
from keras.applications.resnet50 import ResNet50
def conv_bn_relu(feature_map, filters, kernel, activation=True):
    """Apply Conv2D ('same' padding), BatchNormalization and an optional ReLU.

    Args:
        feature_map: Input tensor fed to the convolution.
        filters: Number of convolution filters.
        kernel: Convolution kernel size.
        activation: When true (the default), a ReLU activation is appended
            after batch normalization.

    Returns:
        The output tensor of the last layer applied.
    """
    convolved = Conv2D(filters, kernel, padding='same')(feature_map)
    normalized = BatchNormalization()(convolved)
    if not activation:
        return normalized
    return Activation('relu')(normalized)
def bottleneck(inputs, depth, depth_bottleneck, stride=1):
    """Build a residual bottleneck unit on top of `inputs`.

    The residual branch is 1x1 -> 3x3 -> 1x1 conv/BN stacks (the last one
    without ReLU); the shortcut branch is a 1x1 conv/BN/ReLU projection of
    `inputs`. Both branches are summed and passed through a ReLU.

    Args:
        inputs: Input tensor.
        depth: Number of filters of the unit's output.
        depth_bottleneck: Number of filters of the two inner convolutions.
        stride: Accepted for interface compatibility; not used by this body.

    Returns:
        The activated sum of the residual and shortcut branches.
    """
    reduced = conv_bn_relu(inputs, depth_bottleneck, (1, 1))
    mixed = conv_bn_relu(reduced, depth_bottleneck, (3, 3))
    expanded = conv_bn_relu(mixed, depth, (1, 1), activation=False)
    # The shortcut is projected to `depth` channels so it can be added.
    projected = conv_bn_relu(inputs, depth, [1, 1])
    relu = Activation('relu')
    return relu(add([expanded, projected]))
def global_net(feature_maps, point_num):
    """Build the global branch over a list of backbone feature maps.

    The feature maps are visited in reverse order. Each one is reduced to 256
    channels; from the second one on, a resized and 1x1-convolved copy is
    added to it. Every level also yields a `point_num`-channel output resized
    to 128x128.

    Args:
        feature_maps: List of backbone tensors, visited last-to-first.
        point_num: Number of output channels per level.

    Returns:
        A tuple `(global_feature_maps, global_outputs)`, both lists ordered
        like `feature_maps`.
    """
    global_feature_maps = []
    global_outputs = []
    # Explicit sentinel instead of probing `dir()` for the local name.
    last_feature_map = None
    for feature_map in reversed(feature_maps):
        feature_map = conv_bn_relu(feature_map, 256, (1, 1))
        if last_feature_map is not None:
            size = tuple(K.int_shape(feature_map)[1:3])
            # Bind `size` as a default argument. A plain closure over the
            # loop variable is late-binding: when `multi_gpu_model` replays
            # the model on each replica, every Lambda would see the size of
            # the LAST iteration and the `add` below would fail with a shape
            # mismatch.
            # NOTE(review): this resizes `feature_map` to its own size; a
            # top-down pathway would normally upsample `last_feature_map`
            # instead. Left unchanged here -- confirm the intent.
            upsample = Lambda(
                lambda x, size=size: K.tf.image.resize_bilinear(x, size)
            )(feature_map)
            upsample = Conv2D(256, (1, 1), padding='same')(upsample)
            last_feature_map = add([feature_map, upsample])
        else:
            last_feature_map = feature_map
        tmp = conv_bn_relu(last_feature_map, 256, (1, 1))
        out = conv_bn_relu(tmp, point_num, (3, 3))
        out = Lambda(lambda x: K.tf.image.resize_bilinear(x, (128, 128)))(out)
        global_feature_maps.append(last_feature_map)
        global_outputs.append(out)
    return global_feature_maps[::-1], global_outputs[::-1]
def refine_net(feature_maps, point_num):
    """Build the refine branch from the global feature maps.

    The feature map at index `i` goes through `i` bottleneck units, is
    resized to 128x128, and all results are concatenated along axis 3. The
    concatenation passes through one more bottleneck and a final 3x3
    conv/BN/ReLU with `point_num` filters.

    Args:
        feature_maps: List of tensors to refine.
        point_num: Number of filters of the final convolution.

    Returns:
        The output tensor of the final conv/BN/ReLU stack.
    """
    resized_branches = []
    for level, branch in enumerate(feature_maps):
        # Later levels receive more bottleneck units (none for level 0).
        for _ in range(level):
            branch = bottleneck(branch, 256, 128)
        branch = Lambda(lambda t: K.tf.image.resize_bilinear(t, (128, 128)))(branch)
        resized_branches.append(branch)
    stacked = Lambda(lambda t: K.tf.concat(t, axis=3))(resized_branches)
    stacked = bottleneck(stacked, 256, 128)
    return conv_bn_relu(stacked, point_num, (3, 3))
def build_cpn(style):
    """Assemble the full model on an ImageNet-pretrained ResNet50 backbone.

    Four intermediate backbone activations feed `global_net`; its feature
    maps feed `refine_net`. All global outputs plus the refine output are
    concatenated into a single model output.

    Args:
        style: Not used by this body.

    Returns:
        A Keras `Model` mapping the backbone input to the concatenated output.
    """
    num_points = 13
    resnet = ResNet50(weights='imagenet', input_shape=(512, 512, 3), include_top=False)
    tap_names = ['activation_10', 'activation_22', 'activation_40', 'activation_49']
    taps = [resnet.get_layer(tap_name).output for tap_name in tap_names]
    pyramid, coarse_outputs = global_net(taps, num_points)
    refined = refine_net(pyramid, num_points)
    merged = concatenate(coarse_outputs + [refined])
    return Model(inputs=resnet.input, outputs=merged)
We can't reproduce an error with just the network definition. We also need the data and the training code. For the data, please use np.zeros.
We can't reproduce an error with just a network. We need also data and the training code. For the data, please use
np.zeros.
Here is my training code; I am using four GTX 1080 GPUs:
import tensorflow as tf
from keras.utils import multi_gpu_model
def l2_loss(y_true, y_pred):
    """Return the mean squared error against channel-tiled targets.

    `y_true` is repeated 5 times along its last axis before being compared
    with `y_pred`.
    """
    tiled_targets = K.repeat_elements(y_true, rep=5, axis=-1)
    squared_error = K.square(tiled_targets - y_pred)
    return K.tf.reduce_mean(squared_error)
import numpy as np  # `np` is used below but was never imported in this snippet

# All-zero dummy inputs and targets, used only to reproduce the error.
x = np.zeros([10, 512, 512, 3])
y = np.zeros([10, 128, 128, 13])
cpn = build_cpn(None)
# Replicate the model across 4 GPUs, then train the replicated model.
multi_model = multi_gpu_model(cpn, gpus=4)
multi_model.compile(optimizer='Adam', loss=l2_loss)
multi_model.fit(x, y)
Can you try:
import tensorflow as tf
from keras.utils import multi_gpu_model
def l2_loss(y_true, y_pred):
    """Return the mean squared error against channel-tiled targets.

    `y_true` is repeated 5 times along its last axis before being compared
    with `y_pred`.
    """
    repeated = K.repeat_elements(y_true, rep=5, axis=-1)
    residual = repeated - y_pred
    return K.tf.reduce_mean(K.square(residual))
import numpy as np  # `np` is used below but was never imported in this snippet

# All-zero dummy inputs and targets, used only to reproduce the error.
x = np.zeros([10, 512, 512, 3])
y = np.zeros([10, 128, 128, 13])
cpn = build_cpn(None)
# Single-GPU control run: compile and fit the model without multi_gpu_model.
cpn.compile(optimizer='Adam', loss=l2_loss)
cpn.fit(x, y)
And report if the bug is still there?
Can you try:
import tensorflow as tf
from keras.utils import multi_gpu_model

def l2_loss(y_true, y_pred):
    y_true = K.repeat_elements(y_true, rep=5, axis=-1)
    loss = K.tf.reduce_mean(K.square(y_true - y_pred))
    return loss

x = np.zeros([10,512,512,3])
y = np.zeros([10,128,128,13])
cpn = build_cpn(None)
cpn.compile(optimizer='Adam', loss=l2_loss)
cpn.fit(x, y)

And report if the bug is still there?
The single-GPU model does not trigger the bug.
Thank you for reporting. Can you try to remove layers while keeping the bug present? Having the bug on a really small network will help us debug the issue quickly. The fewer layers it has, the easier it will be to flush out the bug.
Thank you for reporting. Can you try to remove layers and keep the bug there? having the bug on a really small network will help us debug quickly the issue. The fewer layers it has, the easier it will be to flush out the bug.
I found that the bug is triggered by the `add` function:
from keras.models import Model
from keras.layers import Conv2D, Lambda, add , concatenate
from keras import backend as K
from keras.applications.resnet50 import ResNet50
import tensorflow as tf
from keras.utils import multi_gpu_model
import numpy as np
def global_net(feature_maps, point_num):
    """Build one `point_num`-channel output per backbone feature map.

    The feature maps are visited in reverse order. From the second one on,
    a bilinearly resized copy of the feature map is added to it before the
    output convolution. Each output is resized to 128x128.

    Args:
        feature_maps: List of backbone tensors, visited last-to-first.
        point_num: Number of filters of each output convolution.

    Returns:
        The list of outputs, ordered like `feature_maps`.
    """
    global_outputs = []
    # Explicit sentinel instead of probing `dir()` for the local name.
    last_feature_map = None
    for feature_map in reversed(feature_maps):
        if last_feature_map is not None:
            size = tuple(K.int_shape(feature_map)[1:3])
            # Bind `size` as a default argument. A plain closure over the
            # loop variable is late-binding: when `multi_gpu_model` replays
            # the model on each replica, every Lambda would see the size of
            # the LAST iteration, so `add` received tensors of different
            # spatial sizes. This was the reported "bug in add".
            upsample = Lambda(
                lambda x, size=size: K.tf.image.resize_bilinear(x, size)
            )(feature_map)
            last_feature_map = add([feature_map, upsample])
        else:
            last_feature_map = feature_map
        out = Conv2D(point_num, (3, 3), padding='same')(last_feature_map)
        out = Lambda(lambda x: K.tf.image.resize_bilinear(x, (128, 128)))(out)
        global_outputs.append(out)
    return global_outputs[::-1]
def build_cpn(style):
    """Assemble the reduced repro model on a pretrained ResNet50 backbone.

    Four intermediate backbone activations feed `global_net`, whose outputs
    are concatenated into a single model output.

    Args:
        style: Not used by this body.

    Returns:
        A Keras `Model` mapping the backbone input to the concatenated output.
    """
    num_points = 13
    resnet = ResNet50(weights='imagenet', input_shape=(512, 512, 3), include_top=False)
    tap_names = ['activation_10', 'activation_22', 'activation_40', 'activation_49']
    taps = [resnet.get_layer(tap_name).output for tap_name in tap_names]
    merged = concatenate(global_net(taps, num_points))
    return Model(inputs=resnet.input, outputs=merged)
def l2_loss(y_true, y_pred):
    """Return the mean squared error against channel-tiled targets.

    `y_true` is repeated 4 times along its last axis before being compared
    with `y_pred`.
    """
    tiled_targets = K.repeat_elements(y_true, rep=4, axis=-1)
    squared_error = K.square(tiled_targets - y_pred)
    return K.tf.reduce_mean(squared_error)
# All-zero dummy inputs and targets, used only to reproduce the error.
x = np.zeros((10, 512, 512, 3))
y = np.zeros((10, 128, 128, 13))
cpn = build_cpn(None)

# Single-GPU control run, left disabled by the reporter:
# cpn.compile(optimizer='Adam', loss=l2_loss)
# cpn.fit(x, y, batch_size=2)

# Multi-GPU run: replicate the model across 4 GPUs and train the replica.
multi_model = multi_gpu_model(cpn, gpus=4)
multi_model.compile(optimizer='Adam', loss=l2_loss)
multi_model.fit(x, y)
@gabrieldemarmiesse - Hi, based on the above conversation just wanted to check if you already looked into this issue and if so, keep us posted with your comments on this. Please let us know if you haven't looked into it. So that we can triage this further. Thanks !
Really sorry, I lost track of this issue by lack of organisation and time. If you could please follow up on this in my stead it would be very appreciated.
From what I read it really looks like a bug. Since I don't have a multi gpu setup, I couldn't try to reproduce it, so I can't confirm anything.
From what I read it really looks like a bug. Since I don't have a multi gpu setup, I couldn't try to reproduce it, so I can't confirm anything.
Just to confirm, have you tried to reproduce in colab ?
No, can we use multiple gpus in colab?
@gabrieldemarmiesse Hello, I think I have the same problem with the function `multi_gpu_model`. My model compiles fine on a single GPU, but when I run it on multiple GPUs I get a shape-mismatch error.
Error message: tensorflow.python.framework.errors_impl.InvalidArgumentError: Dimension 1 in both shapes must be equal, but are 256 and 128. Shapes are [?,256,256] and [?,128,128]. for 'net-_t/concatenate_82/concat' (op: 'ConcatV2') with input shapes: [?,256,256,196], [?,128,128,104], [] and with computed input tensors: input[2] = <3>.
I have tried to solve this problem and found that the bug is in the `multi_gpu_model` function. I think the error is caused by this line in the function: `outputs = model(inputs)`. I tested this line on its own and it caused the same error. Could you help me figure it out?
I don't have a multi gpu setup to work on this piece of code. I've flagged this issue as bug because I think it needs attention. So hopefully someone will fix it in the future.
@gabrieldemarmiesse Hi, I've found something new! My model contains the same code as @GodsDusk's.
Code:shape = feature_map.get_shape()
upsample = Lambda(lambda x: K.tf.image.resize_bilinear(x, (shape[1], shape[2])))(feature_map)
When I changed the code to UpSampling2D(size=(2, 2)), my model compiled on multiple GPUs. So maybe there is something wrong with tf.image.resize_bilinear in Keras.
Is there a bilinear interpolation algorithm in Keras?
The odd thing is, when I use tf.image.resize_bilinear in other places to concatenate, it didn't cause error. It only makes mistakes in one place.
Thanks @stillwaterman , you may be right about the bilinear issue. I'll let @fchollet investigate.
I have the same problem. I cut pretty much everything down to a single layer and changed the sizes, but nothing works.
code:
import tensorflow as tf
import keras
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.utils import multi_gpu_model

window_size = 1024
inputs_n = 128
outputs_n = 128
neurons = 128
# `days` is not defined in this snippet; it must come from the caller's session.
n_steps = len(days[0][1]) - window_size

# NOTE(review): the stateful LSTM fixes the batch dimension to `window_size`
# (1024). The traceback reports 512 vs 1024 with gpus=2, which looks like
# each replica receiving a slice of the batch -- TODO confirm.
model = Sequential([
    LSTM(
        neurons,
        batch_input_shape=(window_size, n_steps, inputs_n),
        stateful=True,
    ),
])
parallel_model = multi_gpu_model(model, gpus=2)
error:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in _create_c_op(graph, node_def, inputs, control_inputs)
1658 try:
-> 1659 c_op = c_api.TF_FinishOperation(op_desc)
1660 except errors.InvalidArgumentError as e:
InvalidArgumentError: Dimensions must be equal, but are 512 and 1024 for 'replica_0_8/sequential_12/lstm_12/add' (op: 'Add') with input shapes: [512,128], [1024,128].
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-18-db18560e3da4> in <module>
22
23 from keras.utils import multi_gpu_model
---> 24 parallel_model = multi_gpu_model(model, gpus=2)
25
26 #parallel_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
~/.local/lib/python3.5/site-packages/keras/utils/multi_gpu_utils.py in multi_gpu_model(model, gpus, cpu_merge, cpu_relocation)
225 # Apply model on slice
226 # (creating a model replica on the target device).
--> 227 outputs = model(inputs)
228 outputs = to_list(outputs)
229
~/.local/lib/python3.5/site-packages/keras/engine/base_layer.py in __call__(self, inputs, **kwargs)
455 # Actually call the layer,
456 # collecting output(s), mask(s), and shape(s).
--> 457 output = self.call(inputs, **kwargs)
458 output_mask = self.compute_mask(inputs, previous_mask)
459
~/.local/lib/python3.5/site-packages/keras/engine/network.py in call(self, inputs, mask)
562 return self._output_tensor_cache[cache_key]
563 else:
--> 564 output_tensors, _, _ = self.run_internal_graph(inputs, masks)
565 return output_tensors
566
~/.local/lib/python3.5/site-packages/keras/engine/network.py in run_internal_graph(self, inputs, masks)
719 kwargs['mask'] = computed_mask
720 output_tensors = to_list(
--> 721 layer.call(computed_tensor, **kwargs))
722 output_masks = layer.compute_mask(computed_tensor,
723 computed_mask)
~/.local/lib/python3.5/site-packages/keras/layers/recurrent.py in call(self, inputs, mask, training, initial_state)
2192 mask=mask,
2193 training=training,
-> 2194 initial_state=initial_state)
2195
2196 @property
~/.local/lib/python3.5/site-packages/keras/layers/recurrent.py in call(self, inputs, mask, training, initial_state, constants)
647 mask=mask,
648 unroll=self.unroll,
--> 649 input_length=timesteps)
650 if self.stateful:
651 updates = []
~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in rnn(step_function, inputs, initial_states, go_backwards, mask, constants, unroll, input_length)
2920
2921 time_steps = tf.shape(inputs)[0]
-> 2922 outputs, _ = step_function(inputs[0], initial_states + constants)
2923 output_ta = tensor_array_ops.TensorArray(
2924 dtype=outputs.dtype,
~/.local/lib/python3.5/site-packages/keras/layers/recurrent.py in step(inputs, states)
638 else:
639 def step(inputs, states):
--> 640 return self.cell.call(inputs, states, **kwargs)
641
642 last_output, outputs, states = K.rnn(step,
~/.local/lib/python3.5/site-packages/keras/layers/recurrent.py in call(self, inputs, states, training)
1971 h_tm1_o = h_tm1
1972 i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
-> 1973 self.recurrent_kernel_i))
1974 f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
1975 self.recurrent_kernel_f))
/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py in binary_op_wrapper(x, y)
810 with ops.name_scope(None, op_name, [x, y]) as name:
811 if isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor):
--> 812 return func(x, y, name=name)
813 elif not isinstance(y, sparse_tensor.SparseTensor):
814 try:
/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_math_ops.py in add(x, y, name)
363 try:
364 _, _, _op = _op_def_lib._apply_op_helper(
--> 365 "Add", x=x, y=y, name=name)
366 except (TypeError, ValueError):
367 result = _dispatch.dispatch(
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py in _apply_op_helper(self, op_type_name, name, **keywords)
786 op = g.create_op(op_type_name, inputs, output_types, name=scope,
787 input_types=input_types, attrs=attr_protos,
--> 788 op_def=op_def)
789 return output_structure, op_def.is_stateful, op
790
/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py in new_func(*args, **kwargs)
505 'in a future version' if date is None else ('after %s' % date),
506 instructions)
--> 507 return func(*args, **kwargs)
508
509 doc = _add_deprecated_arg_notice_to_docstring(
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in create_op(***failed resolving arguments***)
3298 input_types=input_types,
3299 original_op=self._default_original_op,
-> 3300 op_def=op_def)
3301 self._create_op_helper(ret, compute_device=compute_device)
3302 return ret
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in __init__(self, node_def, g, inputs, output_types, control_inputs, input_types, original_op, op_def)
1821 op_def, inputs, node_def.attr)
1822 self._c_op = _create_c_op(self._graph, node_def, grouped_inputs,
-> 1823 control_input_ops)
1824
1825 # Initialize self._outputs.
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in _create_c_op(graph, node_def, inputs, control_inputs)
1660 except errors.InvalidArgumentError as e:
1661 # Convert to ValueError for backwards compatibility.
-> 1662 raise ValueError(str(e))
1663
1664 return c_op
ValueError: Dimensions must be equal, but are 512 and 1024 for 'replica_0_8/sequential_12/lstm_12/add' (op: 'Add') with input shapes: [512,128], [1024,128].
Most helpful comment
I have the same. Cut off pretty much everything down to a single layer, changed sizes - nothing works.
code:
error: