Continuing a discussion from #439, and as pointed out by @raopku, the following does not work.
import ray
ray.init()
import cloudpickle
class A(object):
pass
@ray.remote
class B(A):
def __init__(self):
super(B, self).__init__()
B.remote()
At the root of the problem is the fact that cloudpickle doesn't seem to be able to serialize subclasses that call super. For example, consider the following.
import cloudpickle
class A(object):
pass
class B(A):
def __init__(self):
super(B, self).__init__()
cloudpickle.dumps(B)
The last line fails with the following exception.
---------------------------------------------------------------------------
RecursionError Traceback (most recent call last)
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dump(self, obj)
145 try:
--> 146 return Pickler.dump(self, obj)
147 except RuntimeError as e:
/Users/rkn/anaconda3/lib/python3.6/pickle.py in dump(self, obj)
408 self.framer.start_framing()
--> 409 self.save(obj)
410 self.write(STOP)
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_global(self, obj, name, pack)
424
--> 425 self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj)
426 else:
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
585 save(func)
--> 586 save(args)
587 write(pickle.REDUCE)
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_tuple(self, obj)
735 for element in obj:
--> 736 save(element)
737 # Subtle. Same as in the big comment below.
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_dict(self, obj)
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
/Users/rkn/anaconda3/lib/python3.6/pickle.py in _batch_setitems(self, items)
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_function(self, obj, name)
263 or themodule is None):
--> 264 self.save_function_tuple(obj)
265 return
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_function_tuple(self, func)
311 save(_make_skel_func)
--> 312 save((code, closure, base_globals))
313 write(pickle.REDUCE)
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_tuple(self, obj)
735 for element in obj:
--> 736 save(element)
737 # Subtle. Same as in the big comment below.
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_list(self, obj)
780 self.memoize(obj)
--> 781 self._batch_appends(obj)
782
/Users/rkn/anaconda3/lib/python3.6/pickle.py in _batch_appends(self, items)
807 elif n:
--> 808 save(tmp[0])
809 write(APPEND)
... last 16 frames repeated, from the frame below ...
/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
RecursionError: maximum recursion depth exceeded in comparison
During handling of the above exception, another exception occurred:
PicklingError Traceback (most recent call last)
<ipython-input-2-e105f7705909> in <module>()
----> 1 cloudpickle.dumps(B)
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dumps(obj, protocol)
704
705 cp = CloudPickler(file,protocol)
--> 706 cp.dump(obj)
707
708 return file.getvalue()
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dump(self, obj)
148 if 'recursion' in e.args[0]:
149 msg = """Could not pickle object as excessively deep recursion required."""
--> 150 raise pickle.PicklingError(msg)
151
152 def save_memoryview(self, obj):
PicklingError: Could not pickle object as excessively deep recursion required.
I think it is urgent to solve the problem of handling the arguments passed to super() in subclasses.
This problem blocks the development of algorithm libraries based on Ray. I think an algorithm library inevitably relies on class inheritance.
As a workaround, you can do the following. Avoid referring to the class name within the class definition. For example, suppose the base class is
class A(object):
pass
Instead of defining the actor as
@ray.actor
class B(A):
def __init__(self):
super(B, self).__init__()
# This fails.
B()
You can define it as follows.
@ray.actor
class B(A):
def __init__(self):
A.__init__(self)
# This works.
B()
@mehrdadn, you've looked a lot at serialization in Python. Is it correct that the problem has to do with referring to the name of B within the definition of B? Are there obvious solutions or workarounds that we're missing?
It indeed seems to be that they don't support a class referring to itself by name at all:
>>> class A:
def __init__(self): A
>>> import cloudpickle
>>> cloudpickle.dumps(A)
Traceback (most recent call last):
File "<pyshell#5>", line 1, in <module>
cloudpickle.dumps(A)
File "site-packages\cloudpickle\cloudpickle.py", line 706, in dumps
cp.dump(obj)
File "site-packages\cloudpickle\cloudpickle.py", line 146, in dump
return Pickler.dump(self, obj)
File "pickle.py", line 224, in dump
self.save(obj)
File "pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "site-packages\cloudpickle\cloudpickle.py", line 425, in save_global
self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj)
File "site-packages\cloudpickle\cloudpickle.py", line 590, in save_reduce
self.memoize(obj)
File "pickle.py", line 244, in memoize
assert id(obj) not in self.memo
AssertionError
It also seems to be strictly a cloudpickle bug, since pickle doesn't have any problem pickling this either.
The workaround for __init__ is what you mention. The workaround for other cases (which would generally be in the context of @staticmethod, since that's where you explicitly need the class name) is to just use @classmethod instead of @staticmethod so that you can say cls.foo() instead of MyClass.foo(). Not sure what other cases there are, but it's likely this will also happen in at least some situations where two classes cross-reference each other; not sure of an easy workaround in that case though.
I would file this as a bug with cloudpickle.
Thanks @mehrdadn. In this case, pickle doesn't quite do the right thing either. It serializes
class A:
def __init__(self):
A
as b'\x80\x03c__main__\nA\nq\x00.', which doesn't seem to contain enough information to reconstruct the class. Calling pickle.loads on that string raises
AttributeError: Can't get attribute 'A' on <module '__main__'>
Oof, I forgot to delete A from the global namespace before loading it again, that makes sense.
That said, I still think this is a bug... I don't see why it shouldn't be able to work correctly.
Closing this for now since it seems to be a bug in cloudpickle, and we have a viable workaround which is to call BaseClass.__init__(self) instead of super().__init__().
The cloudpickle issue is now fixed with https://github.com/cloudpipe/cloudpickle/pull/102, this should be part of the next cloudpickle release.
It still doesn't work with Ray and the cloudpickle master, but there is new hope that we can fix it!
We haven't resolved this yet, but the following workaround seems to work.
import ray
ray.init()
class Foo(object):
def __init__(self):
self.x = 1
class Bar(Foo):
def __init__(self):
super(Bar, self).__init__()
def get_x(self):
return self.x
Bar2 = ray.remote(Bar)
b = Bar2.remote()
ray.get(b.get_x.remote())
cc @mehrdadn @royf
Any idea how I can do this if I am using actors that require a GPU? If I add a gpu argument to the example above, i.e.
Bar2 = ray.remote(Bar, num_gpus=1)
I get the following error:
File "/home/katadh/anaconda3/envs/softlearning/lib/python3.6/site-packages/ray/worker.py",
line 2440, in remote
assert len(args) == 0 and len(kwargs) > 0, error_string
AssertionError: The @ray.remote decorator must be applied either with no arguments
and no parentheses, for example '@ray.remote', or it must be applied using some of
the arguments 'num_return_vals', 'num_cpus', 'num_gpus', 'resources', 'max_calls', or
'max_reconstructions', like '@ray.remote(num_return_vals=2, resources={"CustomResource": 1})'
@katadh
If you are using Python 3 there seems to be a simple workaround to the entire problem. Just leave out the super arguments:
import ray
ray.init()
class Foo(object):
def __init__(self):
self.x = 1
@ray.remote
class Bar(Foo):
def __init__(self):
super().__init__()
def get_x(self):
return self.x
b = Bar.remote()
print(ray.get(b.get_x.remote()))
This did not cause an error for me and also worked with @ray.remote(num_gpus=1).
@amiranas Thanks! I'll try that
The original issue no longer seems completely accurate, but there is still a problem here.
If I run
import ray
ray.init()
import cloudpickle
class A(object):
pass
@ray.remote
class B(A):
def __init__(self):
super(B, self).__init__()
B.remote()
There is no issue with cloudpickle now, but the actor constructor fails with
2020-11-14 13:12:18,793 ERROR worker.py:1018 -- Possible unhandled error from worker: ray::B.__init__() (pid=73022, ip=192.168.1.104)
File "python/ray/_raylet.pyx", line 484, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
File "<ipython-input-1-aa83cb2b851c>", line 13, in __init__
TypeError: super() argument 1 must be type, not ActorClass(B)
Most helpful comment
We haven't resolved this yet, but the following workaround seems to work.
cc @mehrdadn @royf