INPUT_PATH = '../data/'
OUTPUT_PATH = '../models/'
model_path = '../models/'
import model_train as mt
import torch.nn.functional as F
import ray
import ray.tune as tune
from process_text import Dataset
import torch
ray.init()
ds = Dataset(INPUT_PATH, ['fileid','text'], 'tag')
tv_datafields = [("fileid", 'None'), ("text", 'TEXT'), ("tag", 'LABEL')]
trn_dl,vld_dl = ds.process_text(tv_datafields,'text',n_gpus=torch.device('cuda'))
args = {"dataset":ds,
"train_dl":trn_dl,
"valid_dl":vld_dl, "model_path":model_path}
def train_func(args, config, reporter):
model = mt.CNN(args['dataset'].text.vocab,dp=config['dp']).cuda()
optim = mt.get_optimizer(model,lr=config['lr'])
loss = mt.train_loop(model, config['epochs'], optim,args['train_dl'], args['valid_dl'], args["model_path"], gpu=True)
reporter(validation_loss=loss) # report metrics
tune.register_trainable("train_func",lambda cfg, rprtr: train_func(args,cfg, rprtr))
When I execute tune.register_trainable, I get the following errors:
AttributeError: Can't get local attribute 'wrap_function.
AttributeError: 'torch.dtype' object has no attribute '__module__'
AttributeError Traceback (most recent call last)
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_global(self, obj, name)
917 module = sys.modules[module_name]
--> 918 obj2, parent = _getattribute(module, name)
919 except (ImportError, KeyError, AttributeError):
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _getattribute(obj, name)
265 raise AttributeError("Can't get local attribute {!r} on {!r}"
--> 266 .format(name, obj))
267 try:
AttributeError: Can't get local attribute 'wrap_function.
During handling of the above exception, another exception occurred:
PicklingError Traceback (most recent call last)
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_global(self, obj, name, pack)
638 try:
--> 639 return Pickler.save_global(self, obj, name=name)
640 except Exception:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_global(self, obj, name)
921 "Can't pickle %r: it's not found as %s.%s" %
--> 922 (obj, module_name, name))
923 else:
PicklingError: Can't pickle
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
----> 1 tune.register_trainable("train_func",lambda cfg, rprtr: train_func(args,cfg, rprtr))
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/tune/registry.py in register_trainable(name, trainable)
36 raise TypeError("Second argument must be convertable to Trainable",
37 trainable)
---> 38 _global_registry.register(TRAINABLE_CLASS, name, trainable)
39
40
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/tune/registry.py in register(self, category, key, value)
75 raise TuneError("Unknown category {} not among {}".format(
76 category, KNOWN_CATEGORIES))
---> 77 self._to_flush[(category, key)] = pickle.dumps(value)
78 if _internal_kv_initialized():
79 self.flush_values()
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in dumps(obj, protocol)
879 try:
880 cp = CloudPickler(file, protocol=protocol)
--> 881 cp.dump(obj)
882 return file.getvalue()
883 finally:
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in dump(self, obj)
266 self.inject_addons()
267 try:
--> 268 return Pickler.dump(self, obj)
269 except RuntimeError as e:
270 if 'recursion' in e.args[0]:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in dump(self, obj)
407 if self.proto >= 4:
408 self.framer.start_framing()
--> 409 self.save(obj)
410 self.write(STOP)
411 self.framer.end_framing()
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_global(self, obj, name, pack)
646 typ = type(obj)
647 if typ is not obj and isinstance(obj, (type, types.ClassType)):
--> 648 return self.save_dynamic_class(obj)
649
650 raise
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_dynamic_class(self, obj)
493 # Now save the rest of obj's __dict__. Any references to obj
494 # encountered while saving will point to the skeleton class.
--> 495 save(clsdict)
496
497 # Write a tuple of (skeleton_class, clsdict).
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_function(self, obj, name)
408 # func is nested
409 if lookedup_by_name is None or lookedup_by_name is not obj:
--> 410 self.save_function_tuple(obj)
411 return
412
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_function_tuple(self, func)
551 if hasattr(func, '__qualname__'):
552 state['qualname'] = func.__qualname__
--> 553 save(state)
554 write(pickle.TUPLE)
555 write(pickle.REDUCE) # applies _fill_function on the tuple
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_list(self, obj)
779
780 self.memoize(obj)
--> 781 self._batch_appends(obj)
782
783 dispatch[list] = save_list
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_appends(self, items)
806 write(APPENDS)
807 elif n:
--> 808 save(tmp[0])
809 write(APPEND)
810 # else tmp is empty, and we're done
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_function(self, obj, name)
403 or getattr(obj.__code__, 'co_filename', None) == '
404 or themodule is None):
--> 405 self.save_function_tuple(obj)
406 return
407 else:
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_function_tuple(self, func)
551 if hasattr(func, '__qualname__'):
552 state['qualname'] = func.__qualname__
--> 553 save(state)
554 write(pickle.TUPLE)
555 write(pickle.REDUCE) # applies _fill_function on the tuple
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
519
520 # Save the reduce() output and finally memoize the object
--> 521 self.save_reduce(obj=obj, *rv)
522
523 def persistent_id(self, obj):
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
632
633 if state is not None:
--> 634 save(state)
635 write(BUILD)
636
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
519
520 # Save the reduce() output and finally memoize the object
--> 521 self.save_reduce(obj=obj, *rv)
522
523 def persistent_id(self, obj):
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
632
633 if state is not None:
--> 634 save(state)
635 write(BUILD)
636
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/src/anaconda3/envs/skorch/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
505 # Check for string returned by reduce(), meaning "save as global"
506 if isinstance(rv, str):
--> 507 self.save_global(obj, rv)
508 return
509
~/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py in save_global(self, obj, name, pack)
633 dispatched here.
634 """
--> 635 if obj.__module__ == "__main__":
636 return self.save_dynamic_class(obj)
637
AttributeError: 'torch.dtype' object has no attribute '__module__'
I think the issue is that your function is closing over variables that are not serializable, e.g., torch state. My guess is Dataset is the culprit: try moving the creation of that into the training function.
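For illustration, a rough sketch of that restructuring, reusing the names from the snippet above (Dataset.process_text and model_train are the user's own modules, so their details are assumptions):

import torch
import ray
import ray.tune as tune
import model_train as mt
from process_text import Dataset

INPUT_PATH = '../data/'
model_path = '../models/'

def train_func(config, reporter):
    # Create the Dataset and dataloaders inside the training function so the
    # registered trainable only closes over plain strings, not torch state.
    ds = Dataset(INPUT_PATH, ['fileid', 'text'], 'tag')
    tv_datafields = [("fileid", 'None'), ("text", 'TEXT'), ("tag", 'LABEL')]
    trn_dl, vld_dl = ds.process_text(tv_datafields, 'text', n_gpus=torch.device('cuda'))

    model = mt.CNN(ds.text.vocab, dp=config['dp']).cuda()
    optim = mt.get_optimizer(model, lr=config['lr'])
    loss = mt.train_loop(model, config['epochs'], optim, trn_dl, vld_dl, model_path, gpu=True)
    reporter(validation_loss=loss)

# The function no longer captures torch objects, so it can be registered directly.
tune.register_trainable("train_func", train_func)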
Thanks for your reply. Let me try that out and get back to you.
I changed it as per your suggestion and it works. I also changed it from the functional API to the class API, but the class is not recognizing the config I pass to it.
My code is given below:
from torchtext.data import Field, TabularDataset
from torchtext.data import Iterator, BucketIterator
import torch.nn as nn
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm import tqdm, tnrange
import numpy as np
import spacy
nlp = spacy.load('en')
from ray.tune import Trainable
INPUT_PATH = '../data/'
model_path = '../models/'
class Dataset:
    def __init__(self, path, x_vars, y_vars, test_size=0.2, filename=None,
                 train_test_split=False):
        self.path = path
        self.x_vars = x_vars
        self.y_vars = y_vars
        if train_test_split == True:
            self._get_train_val_data(filename, test_size)

    def _get_train_val_data(self, filename, test_size):
        data = pd.read_csv(f'{self.path}filename')
        self.columns = data.columns
        X_train, X_test, y_train, y_test = train_test_split(data.loc[:, self.x_vars],
                                                            data.loc[:, self.y_vars],
                                                            test_size=test_size, random_state=1)
        train = pd.concat([X_train, y_train], axis=1)
        vld = pd.concat([X_test, y_test], axis=1)
        train.to_csv(f'{self.path}trn.csv', index=False)
        vld.to_csv(f'{self.path}vld.csv', index=False)

    def _tokenize_text(self, text):
        removal = ['PUNCT', 'SPACE']
        text_out_final = []
        for review in nlp(text).sents:
            text_out = []
            for token in review:
                if token.pos_ not in removal and token.is_punct == False:
                    lemma = token.lemma_
                    if lemma != '-PRON-':
                        text_out.append(lemma)
            text_out_final.extend(text_out)
        return text_out_final

    def _init_emb(self, num_special_toks=2):
        sweep_range = len(self.text.vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0
        nn.init.normal_(self.text.vocab.vectors[0], mean=0, std=0.05)
        for i in range(num_special_toks, sweep_range):
            if len(self.text.vocab.vectors[i, :].nonzero()) == 0:
                nn.init.normal_(self.text.vocab.vectors[i], mean=0, std=0.05)
            else:
                num_non_zero += 1
                running_norm += torch.norm(self.text.vocab.vectors[i])
            total_words += 1
        norm_glove = running_norm / num_non_zero
        print(f"Average GloVE norm is {norm_glove}",
              f"Number of known words are {num_non_zero}",
              f"Total number of words are {total_words}")

    def _create_dataset(self, col2field_tuple, embed_nme, min_wrd_freq):
        self.text = Field(sequential=True, tokenize=self._tokenize_text, pad_first=True)
        self.label = Field(sequential=False, use_vocab=False)
        txt2fld = {'TEXT': self.text, 'LABEL': self.label, 'None': None}
        tv_datafields = [(col, txt2fld[label]) for col, label in col2field_tuple]
        trn_ds, vld_ds = TabularDataset.splits(
            path=f'{self.path}',
            train='trn_sample.csv', validation="vld_sample.csv",
            format='csv',
            skip_header=True,
            fields=tv_datafields)
        self.text.build_vocab(trn_ds, vectors=embed_nme, min_freq=min_wrd_freq)
        self._init_emb()
        return trn_ds, vld_ds
class Dataloader_custom:
    def __init__(self, dl_iter, text_col_name, y_var):
        self.dl_iter = dl_iter
        self.x_var = text_col_name
        self.y_var = y_var

    def __iter__(self):
        for batch in self.dl_iter:
            x = getattr(batch, self.x_var)
            if x.size()[0] > 10:
                y = getattr(batch, self.y_var).unsqueeze(0).float()
                yield (x, y)

    def __len__(self):
        return len(self.dl_iter)
class CNN(nn.Module):
    def __init__(self, vocab, filters=[(3, 100), (4, 100), (5, 100), (7, 100)], dp=0.3, bn=True, nl_func=F.relu):
        super(CNN, self).__init__()
        V, D = vocab.vectors.size()
        self.embed = nn.Embedding(V, D)
        self.embed.weight.data.copy_(vocab.vectors)
        self.embed.weight.requires_grad = False
        self.conv_layers = nn.ModuleList([nn.Conv1d(in_channels=D, out_channels=n, kernel_size=ksz) for ksz, n in filters])
        self.amp = nn.AdaptiveMaxPool1d(1)
        self.dp = nn.Dropout(dp)
        self.nl_func = nl_func
        num_features = 0
        for _, n in filters:
            num_features += n
        self.bnz = nn.BatchNorm1d(num_features) if bn else lambda x: x
        self.fc = nn.Linear(num_features, 1)

    def forward(self, text):
        text_embed = self.embed(text).transpose(1, 2).transpose(0, 2)
        conv = [self.amp(self.nl_func(Conv_layer(text_embed))) for Conv_layer in self.conv_layers]
        concat_conv = torch.cat(conv, 2)
        concat_conv = concat_conv.view(concat_conv.size(0), -1)
        bn_concat_conv = self.bnz(concat_conv)
        dp_bn_concat_conv = self.dp(bn_concat_conv)
        out = self.fc(dp_bn_concat_conv)
        out = out.transpose(0, 1)
        return out
def get_optimizer(model, lr, wd=0):
    # Setting optimizer
    parameters = filter(lambda x: x.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim
class TrainCNN(Trainable):
    def _setup(self, config):
        args = {}
        vars(args).update(config)
        print("Dataset creation started")
        ds = Dataset(INPUT_PATH, ['fileid', 'text'], 'tag')
        tv_datafields = [("fileid", 'None'), ("text", 'TEXT'), ("tag", 'LABEL')]
        self.trn_ds, self.vld_ds = ds._create_dataset(tv_datafields, 'glove.6B.300d', 2)
        print('Dataset is created')
        self.train_loader, self.test_loader = _create_dl_iterator(self)
        self.train_loader = Dataloader_custom(self.train_loader, 'text', 'tag')
        self.test_loader = Dataloader_custom(self.test_loader, 'text', 'tag')
        self.model = CNN(ds.text.vocab, filters=[(3, 100), (4, 100), (5, 100), (7, 100)], dp=0.3, bn=True, nl_func=F.relu).cuda()
        self.optimizer = get_optimizer(model, args.lr, wd=0)

    def _create_dl_iterator(self):
        train_iter, val_iter = BucketIterator.splits((self.trn_ds, self.vld_ds),
                                                     batch_sizes=(64, 64),
                                                     device=torch.device('cuda'),
                                                     sort_key=lambda x: len(getattr(x, text_col_name)),
                                                     sort_within_batch=False,
                                                     repeat=False)
        return train_iter, val_iter

    def _train_iteration(self):
        # Setting model in train mode
        gpu = True
        verbose = True
        self.model.train()
        total, sum_loss = 0, 0
        for x_text, label in tqdm(self.train_loader, desc='Training', dynamic_ncols=True):
            bsz = label.shape[0]
            y = label.float()
            if gpu:
                x_text = x_text.cuda()
                y = y.cuda()
            y_hat = self.model(x_text)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            total += bsz
            sum_loss += bsz * (loss.item())
        if verbose: print(np.sqrt(sum_loss / total))
        return sum_loss / total

    def _test(self):
        gpu = True
        self.model.eval()
        total, sum_loss = 0, 0
        for x_text, label in tqdm(self.valid_dl, desc='Validation', dynamic_ncols=True):
            bsz = label.shape[0]
            y = label.float()
            if gpu:
                x_text = x_text.cuda()
                y = y.cuda()
            y_hat = self.model(x_text)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            sum_loss += bsz * (loss.item())
            total += bsz
        return {"mean_loss": sum_loss / total}

    def _train(self):
        self._train_iteration()
        return self._test()

    def _save(self, path):
        torch.save(self.model.state_dict(), os.path.join(path, "model.pth"))
        return path

    def _restore(self, path):
        self.model.load_state_dict(os.path.join(path, "model.pth"))
if __name__ == '__main__':
    import numpy as np
    import ray
    from ray import tune

    ray.init()
    tune.run_experiments(
        {"exp": {
            "config": {"lr": tune.grid_search([0.1, 0.2])},
            "trial_resources": {
                "cpu": 3,
                "gpu": 1
            },
            "run": TrainCNN
        }})
It gives me the following error:
Remote function stop failed with:
Traceback (most recent call last):
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/worker.py", line 923, in _process_task
    self.reraise_actor_init_error()
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/worker.py", line 267, in reraise_actor_init_error
    raise self.actor_init_error
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/worker.py", line 923, in _process_task
    self.reraise_actor_init_error()
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/worker.py", line 267, in reraise_actor_init_error
    raise self.actor_init_error
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/worker.py", line 945, in _process_task
    *arguments)
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/actor.py", line 261, in actor_method_executor
    method_returns = method(actor, *args)
  File "/home/ubuntu/src/anaconda3/envs/skorch/lib/python3.6/site-packages/ray/tune/trainable.py", line 84, in __init__
    self._setup()
TypeError: _setup() missing 1 required positional argument: 'config'
We recently changed the signature of _setup() to _setup(config).
@richardliaw it looks like the documentation isn't updated to reflect this.
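For reference, a minimal sketch of a Trainable under the new signature (a hypothetical example, not the code from this thread):

import ray
from ray import tune
from ray.tune import Trainable

class MyTrainable(Trainable):
    def _setup(self, config):
        # config is now passed directly to _setup()
        self.lr = config["lr"]
        self.timestep = 0

    def _train(self):
        # One training iteration; return a dict of metrics for Tune.
        self.timestep += 1
        return {"mean_loss": 1.0 / (self.timestep * self.lr)}

if __name__ == "__main__":
    ray.init()
    tune.run_experiments({
        "exp": {
            "run": MyTrainable,
            "config": {"lr": tune.grid_search([0.1, 0.2])},
            "stop": {"training_iteration": 5},
        }
    })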
OK, updated in this PR -
https://github.com/ray-project/ray/pull/3081/files#diff-ac05b67effec14a8eabaddf2e45b6b8aR226
Also, Sooraj, to fix your issue, you should upgrade ray by installing the
latest wheels via pip:
https://ray.readthedocs.io/en/latest/installation.html
Let me know if you run into any issues.
Yes, it is working now. Thank you @richardliaw and @ericl. I will keep exploring more options in Ray Tune. I will close this issue for now and reopen it if any other error occurs. Once again, thanks for all the help, guys.