Ray: Pickle Error with TF2.0

Created on 14 Oct 2019  ·  9 Comments  ·  Source: ray-project/ray

System information

Describe the problem

I cannot use ray with Tensorflow 2.0 (stable). Even using the documentation code at https://ray.readthedocs.io/en/latest/using-ray-with-tensorflow.html throws the same pickling error as my own code:

Source code / logs

Here for quick copying is the code:

#!/usr/bin/env python
"""TODO Module Docstring."""

import numpy as np
import ray
import tensorflow as tf
from tensorflow.keras import layers


def create_keras_model():
    """Build and compile a small dense Keras classifier.

    The network takes 32-feature inputs, passes them through two 64-unit
    ReLU layers and ends in a 10-unit softmax layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    # Adds a densely-connected layer with 64 units to the model:
    model.add(layers.Dense(64, activation="relu", input_shape=(32,)))
    # Add another:
    model.add(layers.Dense(64, activation="relu"))
    # Add a softmax layer with 10 output units:
    model.add(layers.Dense(10, activation="softmax"))

    model.compile(
        # tf.train.RMSPropOptimizer is not exposed by the TF 2.x API; the
        # Keras optimizer is available under both TF 1.x and TF 2.x.
        optimizer=tf.keras.optimizers.RMSprop(0.01),
        loss=tf.keras.losses.categorical_crossentropy,
        metrics=[tf.keras.metrics.categorical_accuracy])
    return model


def random_one_hot_labels(shape):
    """Return a random one-hot label matrix.

    Args:
        shape: A ``(num_samples, num_classes)`` pair.

    Returns:
        A float array of that shape in which every row holds a single 1
        at a uniformly drawn class index and 0 everywhere else.
    """
    num_samples, num_classes = shape
    # One random class index per sample.
    hot_columns = np.random.randint(0, num_classes, num_samples)
    one_hot = np.zeros((num_samples, num_classes))
    row_ids = np.arange(num_samples)
    one_hot[row_ids, hot_columns] = 1
    return one_hot


# Initialise Ray before the actor class below is declared and instantiated.
ray.init()


@ray.remote
class Network(object):
    """Ray actor owning a Keras model plus randomly generated training data."""

    def __init__(self):
        self.model = create_keras_model()
        # 1000 random samples with 32 features each, then matching one-hot
        # labels over 10 classes (features are drawn before labels).
        self.dataset = np.random.random((1000, 32))
        self.labels = random_one_hot_labels((1000, 10))

    def train(self):
        """Fit the model on the stored data and return the Keras history dict."""
        fit_result = self.model.fit(self.dataset, self.labels, verbose=False)
        return fit_result.history

    def get_weights(self):
        """Return the model's current weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Overwrite the model's weights with ``weights``."""
        # Note that for simplicity this does not handle the optimizer state.
        self.model.set_weights(weights)


# Instantiate the actor. The traceback in this report shows the pickling
# TypeError being raised from this call.
NetworkActor = Network.remote()
# Request one training pass on the actor, then wait for its result.
result_object_id = NetworkActor.train.remote()
ray.get(result_object_id)

This gives a TypeError:

Traceback (most recent call last):
  File "/home/***/test.py", line 56, in <module>
    NetworkActor = Network.remote()
  File "/home/***/venv/lib/python3.6/site-packages/ray/actor.py", line 322, in remote
    return self._remote(args=args, kwargs=kwargs)
  File "/home/***/venv/lib/python3.6/site-packages/ray/actor.py", line 405, in _remote
    self._modified_class, self._actor_method_names)
  File "/home/***/venv/lib/python3.6/site-packages/ray/function_manager.py", line 578, in export_actor_class
    "class": pickle.dumps(Class),
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 1123, in dumps
    cp.dump(obj)
  File "/home/***/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 482, in dump
    return Pickler.dump(self, obj)
  File "/usr/lib/python3.6/pickle.py", line 409, in dump
    self.save(obj)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 875, in save_global
    self.save_dynamic_class(obj)
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 682, in save_dynamic_class
    obj=obj)
  File "/usr/lib/python3.6/pickle.py", line 610, in save_reduce
    save(args)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 751, in save_tuple
    save(element)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 736, in save_tuple
    save(element)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 875, in save_global
    self.save_dynamic_class(obj)
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 686, in save_dynamic_class
    save(clsdict)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
    self._batch_setitems(obj.items())
  File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
    save(v)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 556, in save_function
    return self.save_function_tuple(obj)
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 756, in save_function_tuple
    save(state)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
    self._batch_setitems(obj.items())
  File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
    save(v)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
    self._batch_setitems(obj.items())
  File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
    save(v)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 556, in save_function
    return self.save_function_tuple(obj)
  File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 756, in save_function_tuple
    save(state)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
    self._batch_setitems(obj.items())
  File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
    save(v)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 781, in save_list
    self._batch_appends(obj)
  File "/usr/lib/python3.6/pickle.py", line 808, in _batch_appends
    save(tmp[0])
  File "/usr/lib/python3.6/pickle.py", line 496, in save
    rv = reduce(self.proto)
TypeError: can't pickle _LazyLoader objects

Most helpful comment

Try this?

import numpy as np
import ray
from tensorflow.keras import layers


def create_keras_model():
    """Build and compile a small dense Keras classifier.

    The network takes 32-feature inputs, passes them through two 64-unit
    ReLU layers and ends in a 10-unit softmax layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Workaround: TensorFlow is imported inside the function rather than at
    # module level, which is the change reported to avoid the
    # "can't pickle _LazyLoader objects" error when the actor is created.
    import tensorflow as tf
    model = tf.keras.Sequential()
    # Adds a densely-connected layer with 64 units to the model:
    model.add(layers.Dense(64, activation="relu", input_shape=(32,)))
    # Add another:
    model.add(layers.Dense(64, activation="relu"))
    # Add a softmax layer with 10 output units:
    model.add(layers.Dense(10, activation="softmax"))

    model.compile(
        # tf.train.RMSPropOptimizer is not exposed by the TF 2.x API; the
        # Keras optimizer is available under both TF 1.x and TF 2.x.
        optimizer=tf.keras.optimizers.RMSprop(0.01),
        loss=tf.keras.losses.categorical_crossentropy,
        metrics=[tf.keras.metrics.categorical_accuracy])
    return model


def random_one_hot_labels(shape):
    """Return a random one-hot label matrix.

    Args:
        shape: A ``(num_samples, num_classes)`` pair.

    Returns:
        A float array of that shape in which every row holds a single 1
        at a uniformly drawn class index and 0 everywhere else.
    """
    num_samples, num_classes = shape
    # One random class index per sample.
    hot_columns = np.random.randint(0, num_classes, num_samples)
    one_hot = np.zeros((num_samples, num_classes))
    row_ids = np.arange(num_samples)
    one_hot[row_ids, hot_columns] = 1
    return one_hot


# Initialise Ray before the actor class below is declared and instantiated.
ray.init()


@ray.remote
class Network(object):
    """Ray actor owning a Keras model plus randomly generated training data."""

    def __init__(self):
        self.model = create_keras_model()
        # 1000 random samples with 32 features each, then matching one-hot
        # labels over 10 classes (features are drawn before labels).
        self.dataset = np.random.random((1000, 32))
        self.labels = random_one_hot_labels((1000, 10))

    def train(self):
        """Fit the model on the stored data and return the Keras history dict."""
        fit_result = self.model.fit(self.dataset, self.labels, verbose=False)
        return fit_result.history

    def get_weights(self):
        """Return the model's current weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Overwrite the model's weights with ``weights``."""
        # Note that for simplicity this does not handle the optimizer state.
        self.model.set_weights(weights)


# Instantiate the actor, request one training pass on it, then wait for
# the result of that pass.
NetworkActor = Network.remote()
result_object_id = NetworkActor.train.remote()
ray.get(result_object_id)

All 9 comments

Try this?

import numpy as np
import ray
from tensorflow.keras import layers


def create_keras_model():
    """Build and compile a small dense Keras classifier.

    The network takes 32-feature inputs, passes them through two 64-unit
    ReLU layers and ends in a 10-unit softmax layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Workaround: TensorFlow is imported inside the function rather than at
    # module level, which is the change reported to avoid the
    # "can't pickle _LazyLoader objects" error when the actor is created.
    import tensorflow as tf
    model = tf.keras.Sequential()
    # Adds a densely-connected layer with 64 units to the model:
    model.add(layers.Dense(64, activation="relu", input_shape=(32,)))
    # Add another:
    model.add(layers.Dense(64, activation="relu"))
    # Add a softmax layer with 10 output units:
    model.add(layers.Dense(10, activation="softmax"))

    model.compile(
        # tf.train.RMSPropOptimizer is not exposed by the TF 2.x API; the
        # Keras optimizer is available under both TF 1.x and TF 2.x.
        optimizer=tf.keras.optimizers.RMSprop(0.01),
        loss=tf.keras.losses.categorical_crossentropy,
        metrics=[tf.keras.metrics.categorical_accuracy])
    return model


def random_one_hot_labels(shape):
    """Return a random one-hot label matrix.

    Args:
        shape: A ``(num_samples, num_classes)`` pair.

    Returns:
        A float array of that shape in which every row holds a single 1
        at a uniformly drawn class index and 0 everywhere else.
    """
    num_samples, num_classes = shape
    # One random class index per sample.
    hot_columns = np.random.randint(0, num_classes, num_samples)
    one_hot = np.zeros((num_samples, num_classes))
    row_ids = np.arange(num_samples)
    one_hot[row_ids, hot_columns] = 1
    return one_hot


# Initialise Ray before the actor class below is declared and instantiated.
ray.init()


@ray.remote
class Network(object):
    """Ray actor owning a Keras model plus randomly generated training data."""

    def __init__(self):
        self.model = create_keras_model()
        # 1000 random samples with 32 features each, then matching one-hot
        # labels over 10 classes (features are drawn before labels).
        self.dataset = np.random.random((1000, 32))
        self.labels = random_one_hot_labels((1000, 10))

    def train(self):
        """Fit the model on the stored data and return the Keras history dict."""
        fit_result = self.model.fit(self.dataset, self.labels, verbose=False)
        return fit_result.history

    def get_weights(self):
        """Return the model's current weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Overwrite the model's weights with ``weights``."""
        # Note that for simplicity this does not handle the optimizer state.
        self.model.set_weights(weights)


# Instantiate the actor, request one training pass on it, then wait for
# the result of that pass.
NetworkActor = Network.remote()
result_object_id = NetworkActor.train.remote()
ray.get(result_object_id)

Works! Interesting, given that with python's multiprocessing the local import is not necessary. Thanks for the help!

Can we get more detail on why it works when the Python statement `import tensorflow as tf` is moved inside the function?
I struggle with

  File "/home/xx/.local/python/lib/python3.7/pickle.py", line 524, in save
    rv = reduce(self.proto)
  File "/home/xx/.local/python/lib/python3.7/socket.py", line 192, in __getstate__
    raise TypeError("Cannot serialize socket object")
TypeError: Cannot serialize socket object

Maybe I missed it, but is Ray trying to serialize even the import context, not just what we pass with ray.put?

serialize

same error here, have you solved it? :)

@pengyuan-zhou can you post a stacktrace? (and reopen this issue when you do?)

@pengyuan-zhou can you post a stacktrace? (and reopen this issue when you do?)

Sure, it happens when I'm calling ray from another project, flow.
The error appears like this,

2020-06-22 10:52:00,288 INFO services.py:1170 -- View the Ray dashboard at localhost:8265
Traceback (most recent call last):
  File "examples/train.py", line 404, in <module>
    main(sys.argv[1:])
  File "examples/train.py", line 397, in main
    train_h_baselines(flow_params, args, multiagent)
  File "examples/train.py", line 304, in train_h_baselines
    **hp
  File "/home/pengzhou/git/h-baselines/hbaselines/algorithms/off_policy.py", line 375, in __init__
    for env_num in range(num_envs)
  File "/home/pengzhou/git/h-baselines/hbaselines/algorithms/off_policy.py", line 375, in <listcomp>
    for env_num in range(num_envs)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/actor.py", line 378, in remote
    return self._remote(args=args, kwargs=kwargs)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/actor.py", line 556, in _remote
    extension_data=str(actor_method_cpu))
  File "python/ray/_raylet.pyx", line 918, in ray._raylet.CoreWorker.create_actor
  File "python/ray/_raylet.pyx", line 923, in ray._raylet.CoreWorker.create_actor
  File "python/ray/_raylet.pyx", line 280, in ray._raylet.prepare_args
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/serialization.py", line 401, in serialize
    return self._serialize_to_msgpack(metadata, value)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/serialization.py", line 373, in _serialize_to_msgpack
    self._serialize_to_pickle5(metadata, python_objects)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/serialization.py", line 353, in _serialize_to_pickle5
    raise e
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/serialization.py", line 350, in _serialize_to_pickle5
    value, protocol=5, buffer_callback=writer.buffer_callback)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 72, in dumps
    cp.dump(obj)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 617, in dump
    return Pickler.dump(self, obj)
  File "/home/pengzhou/miniconda3/envs/flow/lib/python3.7/socket.py", line 192, in __getstate__
    raise TypeError("Cannot serialize socket object")
TypeError: Cannot serialize socket object

Relevant libs are:

ray==0.8.5
numpy==1.18.4
tensorflow==1.15.2
tensorflow-probability==0.8.0
gym==0.14.0
pygame
opencv-python
dm-tree
ray[tune]
scipy==1.1.0
lxml==4.4.1
pyprind==2.11.2
nose2==0.8.0
six==1.11.0
path.py
joblib==0.10.3
python-dateutil==2.7.3
cached_property
cloudpickle==1.2.0
pyglet==1.3.2
matplotlib==3.1.0
imutils==0.5.1
numpydoc
dill
lz4
setproctitle
psutil
boto3==1.4.8
redis~=2.10.6
pandas==0.24.2
plotly==2.4.0

I also tried ray==0.8.5 and redis==3.4.1, same error.
Thanks.
BR.

@pengyuan-zhou can you post a stacktrace? (and reopen this issue when you do?)

Ya I can't reopen the issue by others in this project

Does this help?

Side note:

It is enough to move the definitions of the functions that need TensorFlow to a separate module. The important part seems to be that TensorFlow is not imported in the module where tune.run() is called.

Was this page helpful?
0 / 5 - 0 ratings