Ray: [tune] torchtext pickle error

Created on 18 Feb 2020 · 7Comments · Source: ray-project/ray

The following code was run on Colab:

analysis = tune.run(train_lstm, config={
            "lr": tune.grid_search([0.0005, 0.001, 0.005]),
            "clip": tune.grid_search([1.0, 0.5, 0.1])
            }
    )

print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()

where train_lstm is a function I defined myself using PyTorch.

<ipython-input-31-c51e8c1cd47e> in <module>()
      2 analysis = tune.run(train_lstm, config={
      3             "lr": tune.grid_search([0.0005, 0.001, 0.005]),
----> 4             "clip": tune.grid_search([1.0, 0.5, 0.1])
      5             }
      6     )

5 frames
/usr/local/lib/python3.6/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, stop, config, resources_per_trial, num_samples, local_dir, upload_dir, trial_name_creator, loggers, sync_to_cloud, sync_to_driver, checkpoint_freq, checkpoint_at_end, sync_on_checkpoint, keep_checkpoints_num, checkpoint_score_attr, global_checkpoint_period, export_formats, max_failures, restore, search_alg, scheduler, with_server, server_port, verbose, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, return_trials, ray_auto_init, sync_function)
    227     for i, exp in enumerate(experiments):
    228         if not isinstance(exp, Experiment):
--> 229             run_identifier = Experiment.register_if_needed(exp)
    230             experiments[i] = Experiment(
    231                 name=name,

/usr/local/lib/python3.6/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
    210                 logger.warning(
    211                     "No name detected on trainable. Using {}.".format(name))
--> 212             register_trainable(name, run_object)
    213             return name
    214         else:

/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register_trainable(name, trainable)
     65         raise TypeError("Second argument must be convertable to Trainable",
     66                         trainable)
---> 67     _global_registry.register(TRAINABLE_CLASS, name, trainable)
     68 
     69 

/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register(self, category, key, value)
    104             raise TuneError("Unknown category {} not among {}".format(
    105                 category, KNOWN_CATEGORIES))
--> 106         self._to_flush[(category, key)] = pickle.dumps(value)
    107         if _internal_kv_initialized():
    108             self.flush_values()

/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
     66     with io.BytesIO() as file:
     67         cp = CloudPickler(file, protocol=protocol, buffer_callback=buffer_callback)
---> 68         cp.dump(obj)
     69         return file.getvalue()
     70 

/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dump(self, obj)
    555     def dump(self, obj):
    556         try:
--> 557             return Pickler.dump(self, obj)
    558         except RuntimeError as e:
    559             if "recursion" in e.args[0]:

TypeError: 'generator' object is not callable

Can anyone figure out what is going wrong with the pickling?

bug tune

Source

JiahaoYao

Most helpful comment

Sorry for the slow reply - I actually thought I posted a reply already :)

This is a scoping issue.
Torchtext fields and datasets need to be created within the Trainable because the Trainable needs to be pickled and replicated across multiple parallel processes.

If they are not, Ray/cloudpickle will try to serialize them and fail.

For example, you'll want to do something like:

```python
from ray import tune
from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test

from nltk.util import ngrams
from collections import defaultdict, Counter
import numpy as np
import math
import tqdm
import random

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Parameter
import torchtext

# XXX Note that we duplicate this data loading here because we need the vocab size to create the model

# download and load the data

text_field = torchtext.data.Field()
datasets = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
train_dataset, validation_dataset, test_dataset = datasets
text_field.build_vocab(train_dataset)
vocab = text_field.vocab
vocab_size = len(vocab)

train_text = train_dataset.examples[0].text # a list of tokens (strings)
validation_text = validation_dataset.examples[0].text

import torch.optim as optim

# ... other code

class LSTMModel():
    # ... other code
    def train_on_dataset(self, dataset):
        """Train ``self.network`` on ``dataset`` and reload the best checkpoint.

        The dataset is passed in by the caller (the Tune trainable) instead of
        being read from a module-level variable, so it does not have to be
        pickled together with the trainable.

        NOTE(review): validation still reads a ``validation_dataset`` name that
        is not a parameter of this method -- confirm where it is defined.
        """
        train_iterator = torchtext.data.BPTTIterator(dataset, batch_size=64,
                                                     bptt_len=32, device='cuda')
        ## THE REST OF THE CODE IS THE SAME AS YOUR train FUNCTION
        h = Variable(torch.zeros(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        c = Variable(torch.zeros(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        state = ( h, c )

        optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr)
        n_epochs = 20

        validation_score_best = 300
        PATH = '/content/model.pth'


        for epoch in range(n_epochs):
            print('Epoch', epoch)
            for batch in tqdm.tqdm_notebook(train_iterator, leave=False):
                assert self.network.training, 'make sure your network is in train mode with `.train()`'

                # call zero_grad on your optimizer
                optimizer.zero_grad()

                context, word = batch.text, batch.target
                # run your network
                logits, state = self.network(context, state)

                # flatten (seq_len, batch, vocab) / (seq_len, batch) for the loss
                logits = logits.view(-1, logits.shape[-1])
                word = word.view(-1,)

                # compute a loss
                loss = F.cross_entropy(logits, word)

                # call `.backward()` and `.step()` on your optimizer
                loss.backward()
                optimizer.step()

            validation_score = self.dataset_perplexity(validation_dataset)
            print('Validation score:', validation_score)

            # keep the weights of the epoch with the lowest validation score
            if validation_score <= validation_score_best:
                validation_score_best = validation_score
                torch.save(self.network.state_dict(), PATH)


        self.network.load_state_dict(torch.load(PATH))

def train_net(config):
    """Tune trainable: load WikiText2 inside the trial, train, log perplexity.

    The torchtext field and datasets are created here, inside the function,
    so they are not part of what Ray has to pickle when the trainable is
    registered.

    config -- dict with the keys 'lr' and 'clip' for the current trial.
    """
    text_field = torchtext.data.Field()
    datasets = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
    train_dataset, validation_dataset, test_dataset = datasets
    # The BPTT iterator numericalizes tokens through the field's vocab, so the
    # vocab has to be built for this freshly created Field as well.
    text_field.build_vocab(train_dataset)
    validation_text = validation_dataset.examples[0].text
    lstm_model = LSTMModel(config['lr'], config['clip'])
    lstm_model.train_on_dataset(train_dataset)
    perp = lstm_model.perplexity(validation_text)
    print('lstm validation perplexity:', perp)
    tune.track.log(perplexity=perp)

analysis = tune.run(
    train_net,
    config={"lr": tune.grid_search([0.001, 0.01]), 'clip': tune.grid_search([0.001, 0.01])})

# train_net reports its result under the key `perplexity`, and a lower
# perplexity is better, so query that metric and minimise it.
print("Best config: ", analysis.get_best_config(metric="perplexity", mode="min"))

# Get a dataframe for analyzing trial results.

df = analysis.dataframe()

richardliaw on 22 Feb 2020

❤3 👍1

All 7 comments

I'm having the same issue. Are you using torchtext?

cfusting on 18 Feb 2020

Same issue here & I am using torchtext for preprocessing & batching my data.

nateewall on 18 Feb 2020

That is right, @cfusting and @nateewall. I am using torchtext!

JiahaoYao on 18 Feb 2020

@JiahaoYao can you help post a small reproducible script here?

richardliaw on 19 Feb 2020

Hi @richardliaw, here is the code that reproduces the bug. I think it is probably because the dataset is too large to fit into memory.

Again, thank you so much for looking into this specific problem!

My code is


# imports
from ray import tune
from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test

from nltk.util import ngrams
from collections import defaultdict, Counter
import numpy as np
import math
import tqdm
import random

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Parameter
import torchtext

# download and load the data
# NOTE(review): everything below is created at module scope and is read as
# globals by the classes and by `train_net` further down. Per the maintainer
# reply in this thread, torchtext fields and datasets created outside the
# trainable are what Ray/cloudpickle tries (and fails) to serialize.
text_field = torchtext.data.Field()
datasets = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
train_dataset, validation_dataset, test_dataset = datasets
text_field.build_vocab(train_dataset)
vocab = text_field.vocab
# size of the output layer of LSTMNetwork
vocab_size = len(vocab)

train_text = train_dataset.examples[0].text # a list of tokens (strings)
validation_text = validation_dataset.examples[0].text

# quick sanity check of the loaded tokens
print(validation_text[:30])


class LSTMNetwork(nn.Module):
    """A PyTorch Module that holds the neural network for the language model.

    A 3-layer LSTM (input size 128, hidden size 512) followed by dropout and
    two linear layers. The weight matrix of the output layer ``fc2`` is also
    used as the input embedding table, so input and output embeddings are tied.
    """

    def __init__(self, num_tokens=None):
        """Build the layers.

        num_tokens -- number of vocabulary entries, i.e. the size of the output
                      layer. When omitted, the module-level ``vocab_size`` is
                      used, which keeps existing ``LSTMNetwork()`` calls
                      working. Passing it explicitly lets the network be built
                      without any module-level data loading.
        """
        super().__init__()

        if num_tokens is None:
            num_tokens = vocab_size

        self.lstm = nn.LSTM(128, 512, 3, dropout=0.5)
        self.dp = nn.Dropout(0.5)
        self.fc1 = nn.Linear(512, 128)
        self.fc2 = nn.Linear(128, num_tokens)

    def forward(self, x, state):
        """Compute the output of the network.

        Note: In the Pytorch LSTM tutorial, the state variable is named "hidden":
        https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

        The torch.nn.LSTM documentation is quite helpful:
        https://pytorch.org/docs/stable/nn.html#lstm

        x - a tensor of int64 inputs with shape (seq_len, batch)
        state - a tuple of two tensors with shape (num_layers, batch, hidden_size)
                representing the hidden state and cell state of the LSTM.
        returns a tuple with two elements:
          - a tensor of unnormalized scores (logits) with shape
            (seq_len, batch, vocab_size); no softmax is applied here
          - the state tuple returned by the LSTM, detached from the graph.
        """

        # Note that the nn.LSTM module expects inputs with the sequence
        # dimension before the batch by default.
        # In this case the dimensions are already in the right order,
        # but watch out for this since sometimes people put the batch first

        # Look the token ids up in fc2's weight matrix (tied embeddings).
        out = F.embedding(x, self.fc2.weight)
        out, state = self.lstm(out, state)
        # dropout on the LSTM output, before projecting back to embedding size
        out = self.dp(out)
        out = self.fc1(out)
        out = self.fc2(out)
        # Detach the state so the next call does not backpropagate into this one.
        state = (state[0].detach(), state[1].detach())
        return out, state


class LSTMModel:
    """A class that wraps LSTMNetwork to handle training and evaluation.

    lr   -- learning rate passed to the Adam optimizer.
    clip -- applied during training as the maximum gradient norm
            (NOTE(review): assumed meaning of ``clip``; it was stored but never
            used before -- confirm norm clipping is what was intended).
    """

    def __init__(self, lr, clip):
        self.network = LSTMNetwork().cuda()
        self.lr = lr
        self.clip = clip

    def _zero_state(self, batch_size):
        """Return an all-zero (hidden, cell) state matching the network.

        The tensors take their dtype and device from the network's parameters
        and their shape from the LSTM's ``num_layers`` / ``hidden_size``.
        """
        lstm = self.network.lstm
        reference = next(self.network.parameters())
        shape = (lstm.num_layers, batch_size, lstm.hidden_size)
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def train(self, train_data=None, validation_data=None, n_epochs=20,
              checkpoint_path='/content/model.pth'):
        """Train the network and reload the weights of the best epoch.

        train_data      -- torchtext dataset to train on; defaults to the
                           module-level ``train_dataset``.
        validation_data -- torchtext dataset used to score each epoch; defaults
                           to the module-level ``validation_dataset``.
        n_epochs        -- number of passes over ``train_data``.
        checkpoint_path -- file the best weights are written to.
                           NOTE(review): trials that run in parallel with the
                           same path would overwrite each other -- confirm.
        """
        if train_data is None:
            train_data = train_dataset
        if validation_data is None:
            validation_data = validation_dataset

        train_iterator = torchtext.data.BPTTIterator(train_data, batch_size=64,
                                                     bptt_len=32, device='cuda')
        # Iterate over train_iterator with a for loop to get batches
        # each batch object has a .text and .target attribute with
        # token id tensors for the input and output respectively.

        # The initial state passed into the LSTM should be set to zero.
        state = self._zero_state(64)

        optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr)

        # Start from +inf so the first epoch is always checkpointed; a fixed
        # threshold could leave nothing to reload after training.
        validation_score_best = float('inf')
        checkpoint_saved = False

        for epoch in range(n_epochs):
            print('Epoch', epoch)
            for batch in tqdm.tqdm_notebook(train_iterator, leave=False):
                assert self.network.training, 'make sure your network is in train mode with `.train()`'

                # call zero_grad on your optimizer
                optimizer.zero_grad()

                context, word = batch.text, batch.target
                # run your network
                logits, state = self.network(context, state)

                # flatten to (seq_len * batch, vocab) and (seq_len * batch,)
                logits = logits.view(-1, logits.shape[-1])
                word = word.view(-1,)

                # compute a loss
                loss = F.cross_entropy(logits, word)

                # call `.backward()` and `.step()` on your optimizer
                loss.backward()
                if self.clip is not None:
                    torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
                optimizer.step()

            validation_score = self.dataset_perplexity(validation_data)
            print('Validation score:', validation_score)

            if validation_score <= validation_score_best:
                validation_score_best = validation_score
                torch.save(self.network.state_dict(), checkpoint_path)
                checkpoint_saved = True

        # Only reload a checkpoint written by this call, never a stale file.
        if checkpoint_saved:
            self.network.load_state_dict(torch.load(checkpoint_path))

    def next_word_probabilities(self, text_prefix):
        """Return the probability of each vocabulary word following text_prefix.

        The result is a 1-D numpy array with one entry per vocabulary word.

        NOTE(review): ``ids`` is not defined in the code shown here; presumably
        it maps a list of tokens to their vocabulary ids -- confirm.
        """
        prefix_token_tensor = torch.tensor(ids(text_prefix), device='cuda').view(-1, 1)

        # zero initial state for a batch of one sequence
        state = self._zero_state(1)

        with torch.no_grad():
            self.network.eval()

            logits, state = self.network(prefix_token_tensor, state)
            # logits is (seq_len, 1, vocab): take the last time step of the
            # single batch entry. Indexing (rather than squeeze) also works
            # when the prefix is one token long.
            prob = F.softmax(logits[-1, 0], dim=-1)
            prob = prob.cpu().numpy()
        self.network.train()
        return prob

    def dataset_perplexity(self, torchtext_dataset):
        """Return the perplexity of the network on torchtext_dataset as a float.

        Perplexity is exp of the summed cross-entropy divided by the number of
        target tokens. Raises ValueError when the dataset yields no tokens.
        """
        iterator = torchtext.data.BPTTIterator(torchtext_dataset, batch_size=64, bptt_len=32, device='cuda')

        state = self._zero_state(64)

        nll = 0.0
        num_data = 0
        with torch.no_grad():
            self.network.eval()

            for batch in tqdm.tqdm_notebook(iterator, leave=False):

                context, word = batch.text, batch.target

                logits, state = self.network(context, state)

                logits = logits.view(-1, logits.shape[-1])
                word = word.view(-1,)
                num_data += word.shape[0]
                nll += F.cross_entropy(logits, word, reduction='sum')

        self.network.train()

        if num_data == 0:
            raise ValueError('cannot compute perplexity of an empty dataset')

        # .item() turns the 0-d tensor into a plain Python float
        return torch.exp(nll / num_data).item()



import torch.optim as optim


def train_net(config):
    """Tune trainable: train an LSTMModel for one trial and report its score.

    config -- dict with the keys 'lr' and 'clip' for the current trial.

    The validation perplexity is reported to Tune under the key
    ``mean_accuracy``; note that for this value lower is better.

    NOTE(review): ``perplexity`` is not among the LSTMModel methods shown in
    this script -- confirm it is defined elsewhere.
    """
    lstm_model = LSTMModel(config['lr'], config['clip'])
    lstm_model.train()
    # Evaluate once and reuse the value for both the print and the report.
    perplexity = lstm_model.perplexity(validation_text)
    print('lstm validation perplexity:', perplexity)
    tune.track.log(mean_accuracy=perplexity)


analysis = tune.run(
    train_net,
    config={"lr": tune.grid_search([0.001, 0.01]), 'clip': tune.grid_search([0.001, 0.01])})

# train_net logs the validation perplexity under the key "mean_accuracy".
# Lower perplexity is better, so pick the trial that minimises it.
print("Best config: ", analysis.get_best_config(metric="mean_accuracy", mode="min"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()

The bug is

<ipython-input-6-2b980252badb> in <module>()
     10 
     11 analysis = tune.run(
---> 12     train_net, config={"lr": tune.grid_search([0.001, 0.01]), 'clip':  tune.grid_search([0.001, 0.01])})
     13 
     14 print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

5 frames
/usr/local/lib/python3.6/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, stop, config, resources_per_trial, num_samples, local_dir, upload_dir, trial_name_creator, loggers, sync_to_cloud, sync_to_driver, checkpoint_freq, checkpoint_at_end, sync_on_checkpoint, keep_checkpoints_num, checkpoint_score_attr, global_checkpoint_period, export_formats, max_failures, restore, search_alg, scheduler, with_server, server_port, verbose, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, return_trials, ray_auto_init, sync_function)
    227     for i, exp in enumerate(experiments):
    228         if not isinstance(exp, Experiment):
--> 229             run_identifier = Experiment.register_if_needed(exp)
    230             experiments[i] = Experiment(
    231                 name=name,

/usr/local/lib/python3.6/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
    210                 logger.warning(
    211                     "No name detected on trainable. Using {}.".format(name))
--> 212             register_trainable(name, run_object)
    213             return name
    214         else:

/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register_trainable(name, trainable)
     65         raise TypeError("Second argument must be convertable to Trainable",
     66                         trainable)
---> 67     _global_registry.register(TRAINABLE_CLASS, name, trainable)
     68 
     69 

/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register(self, category, key, value)
    104             raise TuneError("Unknown category {} not among {}".format(
    105                 category, KNOWN_CATEGORIES))
--> 106         self._to_flush[(category, key)] = pickle.dumps(value)
    107         if _internal_kv_initialized():
    108             self.flush_values()

/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
     66     with io.BytesIO() as file:
     67         cp = CloudPickler(file, protocol=protocol, buffer_callback=buffer_callback)
---> 68         cp.dump(obj)
     69         return file.getvalue()
     70 

/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dump(self, obj)
    555     def dump(self, obj):
    556         try:
--> 557             return Pickler.dump(self, obj)
    558         except RuntimeError as e:
    559             if "recursion" in e.args[0]:

TypeError: 'generator' object is not callable

JiahaoYao on 19 Feb 2020