Using multi_gpu_model to train a model:
def multi_gpu_test_simple_model():
    """Train a tiny two-layer dense model on two GPUs, then save the parallel model.

    The network is a ``Sequential`` stack of two ``Dense`` layers fitted to
    uniformly random inputs and targets.  The wrapped (parallel) model is
    compiled, trained, and finally handed to ``save_model``; that last call
    is the step the surrounding report says fails with a pickling error.
    """
    # Kept from the original snippet; ModelCheckpoint is imported but not used.
    from keras.callbacks import ModelCheckpoint
    from keras.models import save_model

    print('####### test simple model')

    # Problem size and training settings.
    sample_count = 1000
    n_inputs, n_hidden, n_outputs = 10, 10, 1
    gpu_count = 2
    n_epochs = 8

    # Template model: hidden Dense layer followed by the output Dense layer.
    template = keras.models.Sequential()
    hidden_layer = keras.layers.Dense(n_hidden, input_shape=(n_inputs,))
    template.add(hidden_layer)
    output_layer = keras.layers.Dense(n_outputs)
    template.add(output_layer)

    # Random training data, drawn after the model is built (as in the original).
    features = np.random.random((sample_count, n_inputs))
    targets = np.random.random((sample_count, n_outputs))

    replica = multi_gpu_model(template, gpus=gpu_count)
    replica.compile(loss='mse', optimizer='rmsprop')
    replica.fit(features, targets, epochs=n_epochs)

    # Saving the parallel model (not the template) is what this test exercises.
    save_model(replica, '1.h5', overwrite=True, include_optimizer=True)
Error: can't pickle NotImplementedType objects
Please help me solve this problem.
I had the same error when saving the whole model. Try saving the weights only:
# `save_weights` is a method of the model object, not a function exported by
# `keras.models`, and it has no `include_optimizer` argument (weights only).
parallel_model.save_weights('1.h5', overwrite=True)
In general, I recommend saving only the "template model" (the model built on
the CPU that you passed to multi_gpu_model).
On 1 November 2017 at 13:03, gabrielleyr notifications@github.com wrote:
I had the same error when saving the whole model. Try saving the weights
only:from keras.models import save_weights
save_weights(parallel_model, '1.h5', overwrite=True, include_optimizer=True)—
You are receiving this because you are subscribed to this thread.
Reply to this email directly, view it on GitHub
https://github.com/fchollet/keras/issues/8253#issuecomment-341223553,
or mute the thread
https://github.com/notifications/unsubscribe-auth/AArWbxTNQ81vsUYSMSbd2-_qLcJtY8dtks5syM6agaJpZM4QHgpP
.
@fchollet I tried doing exactly that. I created a simple custom version of ModelCheckpoint that saves the original model at the end of each epoch. It breaks with the error `TypeError: can't pickle module objects`:
Epoch 1/100
27/28 [===========================>..] - ETA: 0s - loss: 0.4386
Validation loss decreased from inf to 0.12643371584514776, saving model
Traceback (most recent call last):
File "./scripts/fcn/train_model.py", line 91, in <module>
main()
File "./scripts/fcn/train_model.py", line 85, in main
callbacks=get_callbacks(model, model_path)
File "/home/kuba/anaconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/home/kuba/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 2117, in fit_generator
callbacks.on_epoch_end(epoch, epoch_logs)
File "/home/kuba/anaconda3/lib/python3.6/site-packages/keras/callbacks.py", line 73, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "./scripts/fcn/train_model.py", line 35, in on_epoch_end
self.model.save(self.path, overwrite=True)
File "/home/kuba/anaconda3/lib/python3.6/site-packages/keras/engine/topology.py", line 2556, in save
save_model(self, filepath, overwrite, include_optimizer)
File "/home/kuba/anaconda3/lib/python3.6/site-packages/keras/models.py", line 107, in save_model
'config': model.get_config()
File "/home/kuba/anaconda3/lib/python3.6/site-packages/keras/engine/topology.py", line 2397, in get_config
return copy.deepcopy(config)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 150, in deepcopy
y = copier(x, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 240, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 150, in deepcopy
y = copier(x, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 215, in _deepcopy_list
append(deepcopy(a, memo))
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 150, in deepcopy
y = copier(x, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 240, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 150, in deepcopy
y = copier(x, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 240, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 150, in deepcopy
y = copier(x, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 220, in _deepcopy_tuple
y = [deepcopy(a, memo) for a in x]
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 220, in <listcomp>
y = [deepcopy(a, memo) for a in x]
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 150, in deepcopy
y = copier(x, memo)
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 220, in _deepcopy_tuple
y = [deepcopy(a, memo) for a in x]
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 220, in <listcomp>
y = [deepcopy(a, memo) for a in x]
File "/home/kuba/anaconda3/lib/python3.6/copy.py", line 169, in deepcopy
rv = reductor(4)
TypeError: can't pickle module objects
Here's the minimal code (with parts irrelevant to the problem omitted) that I used to save the base model while training the multi_gpu_model:
class CustomModelCheckpoint(keras.callbacks.Callback):
    """Save the template model's weights whenever the validation loss improves.

    Keras assigns the model that is being trained to ``self.model`` on every
    callback (through ``set_model``) when training starts.  The template model
    passed to the constructor is therefore kept under a separate attribute;
    storing it in ``self.model`` would let the multi-GPU wrapper replace it,
    and the wrapper's weights would be saved instead.

    Args:
        model: The template (single-device) model whose weights are saved.
        path: Destination file for the weights.
    """

    def __init__(self, model, path):
        super().__init__()
        # Deliberately NOT `self.model`: Keras overwrites that attribute with
        # the model passed to fit()/fit_generator().
        self.template_model = model
        self.path = path
        self.best_loss = np.inf

    def on_epoch_end(self, epoch, logs=None):
        """Save the template model's weights if ``val_loss`` reached a new minimum."""
        loss = (logs or {}).get('val_loss')
        if loss is None:
            # No validation metric this epoch (e.g. no validation data):
            # warn and skip instead of crashing the training run.
            import warnings
            warnings.warn('val_loss is not available, skipping checkpoint.',
                          RuntimeWarning)
            return
        if loss < self.best_loss:
            print("\nValidation loss decreased from {} to {}, saving model".format(self.best_loss, loss))
            self.template_model.save_weights(self.path, overwrite=True)
            self.best_loss = loss
def main():
    """Train a two-GPU replica of the model while checkpointing the template model."""
    model = get_model()
    # Named `parallel_model` so the local does not shadow the name of the
    # `multi_gpu_model` function.
    parallel_model = keras.utils.training_utils.multi_gpu_model(model, gpus=2)
    parallel_model.compile(optimizer='adam', loss='binary_crossentropy')
    parallel_model.fit_generator(
        # ... (generator and remaining arguments omitted in the original post)
        callbacks=[CustomModelCheckpoint(model, model_path)])
When I use model.save_weights(...) in the callback, as in the code above, the model is saved successfully. When I use just model.save(...), the code breaks with the error from the previous post.
However, even when using model.save_weights(...), loading the weights with model.load_weights(...) fails with:
ValueError: You are trying to load a weight file containing 1 layers into a model with 19 layers.
My tensorflow version is 1.3.0 and Keras version is 2.0.8
Looking at the keras.callbacks.ModelCheckpoint class, it uses the self.model object in on_epoch_end(...) and calls its save(...) or save_weights(...) function, as appropriate. I don't see how it obtains the self.model object, though.
Please correct me if I'm wrong, but maybe some magic is going on under the hood of the save(...)/save_weights(...) calls that dynamically assigns meaning to the model object based on TensorFlow graph variables. Maybe model.save_weights() is not looking at the model object at all - maybe it's fetching leaf tensors from the TensorFlow graph? In that case, even if I do provide a valid CPU-built model object, what gets saved is whatever the magic inside save_weights() found, which might be the multi_gpu_model tensors.
As mentioned in #8123, the problem is the `import tensorflow as tf` line in the multi_gpu_model function.
I've followed fchollet's advice about saving the model and weights separately. Wound up with this, which is working fine. You'll have to update original_model() to pull the correct layer for your architecture.
def original_model(parallel_model, layer_name=None):
    """Return the template model embedded as a layer of ``parallel_model``.

    The original version looked the layer up by the hard-coded name
    ``'sequential_1'``, which raises "No such layer" for templates built with
    the functional API or given any other name.  The layer is now located
    structurally unless an explicit name is supplied.

    Args:
        parallel_model: The model returned by ``multi_gpu_model``.
        layer_name: Optional name of the template layer.  When given, it is
            looked up directly with ``get_layer``.

    Returns:
        The layer of ``parallel_model`` that holds the template model.

    Raises:
        ValueError: If no nested model is found (or, when ``layer_name`` is
            given, whatever ``get_layer`` raises for an unknown name).
    """
    if layer_name is not None:
        return parallel_model.get_layer(layer_name)
    for layer in parallel_model.layers:
        # Assumes a nested model exposes a `layers` attribute while ordinary
        # layers do not - TODO confirm for the Keras version in use.
        if hasattr(layer, 'layers'):
            return layer
    raise ValueError('No nested model found among the layers of parallel_model.')
class ParallelModelCheckpoint(ModelCheckpoint):
    """Weights-only checkpoint that targets the template model inside a parallel model."""

    def __init__(self, path):
        # Always checkpoint weights only.
        super(ParallelModelCheckpoint, self).__init__(path, save_weights_only=True)

    def set_model(self, model):
        # Swap the model handed over by Keras for the template model it wraps.
        template = original_model(model)
        super(ParallelModelCheckpoint, self).set_model(template)
# Build the template model and wrap it for training on two GPUs.
model = get_model()
parallel_model = multi_gpu_model(model, gpus=2)
# Compile and save the template model itself; `...` marks arguments the
# original author left out.
model.compile(...)
model.save(...)
# Weights-only checkpoints; the file name pattern embeds the epoch number and
# the validation loss.
checkpoint = ParallelModelCheckpoint('out/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
# Training runs on the parallel model, with the checkpoint callback attached.
parallel_model.compile(...)
parallel_model.fit(..., callbacks=[checkpoint])
@jmcconnell, I use the functional API to create the model. When I use your code below, it gives the error:
in get_layer ValueError: No such layer: sequential_1.
I am frustrated with trying to save the model/weights at each epoch when using multi_gpu_model. It always gives some kind of error.
def original_model(parallel_model):
    """Fetch the layer named 'sequential_1' from ``parallel_model``.

    The name is hard-coded, so the lookup only succeeds when the parallel
    model actually contains a layer with that name.
    """
    template_layer_name = 'sequential_1'
    template = parallel_model.get_layer(template_layer_name)
    return template
class ParallelModelCheckpoint(ModelCheckpoint):
    """ModelCheckpoint variant that saves only weights, taken from the wrapped template model."""

    def __init__(self, path):
        # Weights-only mode is fixed; callers only choose the destination path.
        super(ParallelModelCheckpoint, self).__init__(path, save_weights_only=True)

    def set_model(self, model):
        # Register the template model (looked up via original_model) instead
        # of the model Keras passes in.
        wrapped = original_model(model)
        super(ParallelModelCheckpoint, self).set_model(wrapped)
# Template model plus its two-GPU wrapper.
model = get_model()
parallel_model = multi_gpu_model(model, gpus=2)
# The template model is compiled and saved on its own (`...` are placeholders
# for arguments omitted in the post).
model.compile(...)
model.save(...)
# Checkpoint callback; epoch number and validation loss are substituted into
# the file name pattern.
checkpoint = ParallelModelCheckpoint('out/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
# The parallel model is the one that is compiled for training and fitted.
parallel_model.compile(...)
parallel_model.fit(..., callbacks=[checkpoint])
@fchollet, would you please give an example of how to save the model/weights after each epoch when using multi_gpu_model?
I always get some errors.
@PBehr 's suggestion worked for me.
@shu-hai yeah, I mentioned you'll have to update original_model() to work with your architecture. I'm sure there is a more generalized way of grabbing the correct layer, but I am new to Keras and didn't have time to look into it.
Run a print(parallel_model.summary()) to see what the layer for your original model is called.
The solution by @maxim5 worked for me: https://github.com/keras-team/keras/issues/8123#issuecomment-354857044
Closing as this is resolved
Most helpful comment
As mentioned in #8123, the problem is the `import tensorflow as tf` line in the multi_gpu_model function.