The following code runs on Colab:
```python
analysis = tune.run(train_lstm, config={
    "lr": tune.grid_search([0.0005, 0.001, 0.005]),
    "clip": tune.grid_search([1.0, 0.5, 0.1])
    }
)

print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()
```
where `train_lstm` is my own training function defined with PyTorch. The error is:
```
<ipython-input-31-c51e8c1cd47e> in <module>()
2 analysis = tune.run(train_lstm, config={
3 "lr": tune.grid_search([0.0005, 0.001, 0.005]),
----> 4 "clip": tune.grid_search([1.0, 0.5, 0.1])
5 }
6 )
5 frames
/usr/local/lib/python3.6/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, stop, config, resources_per_trial, num_samples, local_dir, upload_dir, trial_name_creator, loggers, sync_to_cloud, sync_to_driver, checkpoint_freq, checkpoint_at_end, sync_on_checkpoint, keep_checkpoints_num, checkpoint_score_attr, global_checkpoint_period, export_formats, max_failures, restore, search_alg, scheduler, with_server, server_port, verbose, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, return_trials, ray_auto_init, sync_function)
227 for i, exp in enumerate(experiments):
228 if not isinstance(exp, Experiment):
--> 229 run_identifier = Experiment.register_if_needed(exp)
230 experiments[i] = Experiment(
231 name=name,
/usr/local/lib/python3.6/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
210 logger.warning(
211 "No name detected on trainable. Using {}.".format(name))
--> 212 register_trainable(name, run_object)
213 return name
214 else:
/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register_trainable(name, trainable)
65 raise TypeError("Second argument must be convertable to Trainable",
66 trainable)
---> 67 _global_registry.register(TRAINABLE_CLASS, name, trainable)
68
69
/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register(self, category, key, value)
104 raise TuneError("Unknown category {} not among {}".format(
105 category, KNOWN_CATEGORIES))
--> 106 self._to_flush[(category, key)] = pickle.dumps(value)
107 if _internal_kv_initialized():
108 self.flush_values()
/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
66 with io.BytesIO() as file:
67 cp = CloudPickler(file, protocol=protocol, buffer_callback=buffer_callback)
---> 68 cp.dump(obj)
69 return file.getvalue()
70
/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dump(self, obj)
555 def dump(self, obj):
556 try:
--> 557 return Pickler.dump(self, obj)
558 except RuntimeError as e:
559 if "recursion" in e.args[0]:
TypeError: 'generator' object is not callable
```
Can anyone figure out what is wrong with the pickling?
I'm having the same issue. Are you using torchtext?
Same issue here & I am using torchtext for preprocessing & batching my data.
That is right, @cfusting and @nateewall. I am using torchtext!
@JiahaoYao can you help post a small reproducible script here?
Hi @richardliaw, here is the code that reproduces the bug. I think it is probably because the dataset is too large to fit into memory.
Again, thank you so much for looking into this specific problem!
My code is:

```python
# imports
from ray import tune
from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test
from nltk.util import ngrams
from collections import defaultdict, Counter
import numpy as np
import math
import tqdm
import random
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Parameter
import torchtext
# download and load the data
text_field = torchtext.data.Field()
datasets = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
train_dataset, validation_dataset, test_dataset = datasets
text_field.build_vocab(train_dataset)
vocab = text_field.vocab
vocab_size = len(vocab)
train_text = train_dataset.examples[0].text # a list of tokens (strings)
validation_text = validation_dataset.examples[0].text
print(validation_text[:30])
class LSTMNetwork(nn.Module):
    # a PyTorch Module that holds the neural network for your model
    def __init__(self):
        super().__init__()
        # YOUR CODE HERE
        self.lstm = nn.LSTM(128, 512, 3, dropout=0.5)
        self.dp = nn.Dropout(0.5)
        self.fc1 = nn.Linear(512, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x, state):
        """Compute the output of the network.

        Note: In the Pytorch LSTM tutorial, the state variable is named "hidden":
        https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
        The torch.nn.LSTM documentation is quite helpful:
        https://pytorch.org/docs/stable/nn.html#lstm

        x - a tensor of int64 inputs with shape (seq_len, batch)
        state - a tuple of two tensors with shape (num_layers, batch, hidden_size)
            representing the hidden state and cell state of the LSTM.

        returns a tuple with two elements:
        - a tensor of log probabilities with shape (seq_len, batch, vocab_size)
        - a state tuple returned by applying the LSTM.
        """
        # Note that the nn.LSTM module expects inputs with the sequence
        # dimension before the batch by default.
        # In this case the dimensions are already in the right order,
        # but watch out for this since sometimes people put the batch first.
        # YOUR CODE HERE
        out = F.embedding(x, self.fc2.weight)
        out, state = self.lstm(out, state)
        out = self.dp(out)
        out = self.fc1(out)
        # this dropout is crucial
        out = self.fc2(out)
        state = (state[0].detach(), state[1].detach())
        return out, state
class LSTMModel:
    "A class that wraps LSTMNetwork to handle training and evaluation."

    def __init__(self, lr, clip):
        self.network = LSTMNetwork().cuda()
        self.lr = lr
        self.clip = clip

    def train(self):
        train_iterator = torchtext.data.BPTTIterator(train_dataset, batch_size=64,
                                                     bptt_len=32, device='cuda')
        # Iterate over train_iterator with a for loop to get batches;
        # each batch object has a .text and .target attribute with
        # token id tensors for the input and output respectively.
        # The initial state passed into the LSTM should be set to zero.
        # YOUR CODE HERE
        h = Variable(torch.zeros(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        c = Variable(torch.zeros(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        state = (h, c)
        optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr)
        n_epochs = 20
        validation_score_best = 300
        PATH = '/content/model.pth'
        for epoch in range(n_epochs):
            print('Epoch', epoch)
            for batch in tqdm.tqdm_notebook(train_iterator, leave=False):
                assert self.network.training, 'make sure your network is in train mode with `.train()`'
                # call zero_grad on your optimizer
                optimizer.zero_grad()
                context, word = batch.text, batch.target
                # run your network
                logits, state = self.network(context, state)
                logits = logits.view(-1, logits.shape[-1])
                word = word.view(-1,)
                # compute a loss
                loss = F.cross_entropy(logits, word)
                # call `.backward()` and `.step()` on your optimizer
                loss.backward()
                optimizer.step()
            validation_score = self.dataset_perplexity(validation_dataset)
            print('Validation score:', validation_score)
            if validation_score <= validation_score_best:
                validation_score_best = validation_score
                torch.save(self.network.state_dict(), PATH)
        self.network.load_state_dict(torch.load(PATH))

    def next_word_probabilities(self, text_prefix):
        "Return a list of probabilities for each word in the vocabulary."
        prefix_token_tensor = torch.tensor(ids(text_prefix), device='cuda').view(-1, 1)
        print(prefix_token_tensor.shape)
        # YOUR CODE HERE
        # initialize hidden layers
        # refer: https://discuss.pytorch.org/t/correct-way-to-declare-hidden-and-cell-states-of-lstm/15745/3
        h = Variable(next(self.network.parameters()).data.new(3, 1, self.network.lstm.hidden_size), requires_grad=False)
        c = Variable(next(self.network.parameters()).data.new(3, 1, self.network.lstm.hidden_size), requires_grad=False)
        state = (h, c)
        with torch.no_grad():
            self.network.eval()
            logits, state = self.network(prefix_token_tensor, state)
            logits = logits.squeeze()
            prob = F.softmax(logits, dim=-1)
            # is this right?
            prob = prob[-1]
            prob = prob.data.cpu().numpy()
            self.network.train()
        return prob

    def dataset_perplexity(self, torchtext_dataset):
        "Return perplexity as a float."
        # Your code should be very similar to next_word_probabilities, but
        # run in a loop over batches. Use torch.no_grad() for extra speed.
        iterator = torchtext.data.BPTTIterator(torchtext_dataset, batch_size=64, bptt_len=32, device='cuda')
        h = Variable(next(self.network.parameters()).data.new(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        c = Variable(next(self.network.parameters()).data.new(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        state = (h, c)
        # YOUR CODE HERE
        nll = 0.0
        num_data = 0
        with torch.no_grad():
            self.network.eval()
            for batch in tqdm.tqdm_notebook(iterator, leave=False):
                context, word = batch.text, batch.target
                logits, state = self.network(context, state)
                logits = logits.view(-1, logits.shape[-1])
                word = word.view(-1,)
                num_data += word.shape[0]
                nll += F.cross_entropy(logits, word, reduction='sum')
            self.network.train()
        nll = nll / num_data
        score = torch.exp(nll)
        score = score.data.cpu().numpy()
        return score
import torch.optim as optim

def train_net(config):
    lstm_model = LSTMModel(config['lr'], config['clip'])
    lstm_model.train()
    print('lstm validation perplexity:', lstm_model.perplexity(validation_text))
    tune.track.log(mean_accuracy=lstm_model.perplexity(validation_text))

analysis = tune.run(
    train_net, config={"lr": tune.grid_search([0.001, 0.01]), 'clip': tune.grid_search([0.001, 0.01])})

print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()
```
The error is:
```
<ipython-input-6-2b980252badb> in <module>()
10
11 analysis = tune.run(
---> 12 train_net, config={"lr": tune.grid_search([0.001, 0.01]), 'clip': tune.grid_search([0.001, 0.01])})
13
14 print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))
5 frames
/usr/local/lib/python3.6/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, stop, config, resources_per_trial, num_samples, local_dir, upload_dir, trial_name_creator, loggers, sync_to_cloud, sync_to_driver, checkpoint_freq, checkpoint_at_end, sync_on_checkpoint, keep_checkpoints_num, checkpoint_score_attr, global_checkpoint_period, export_formats, max_failures, restore, search_alg, scheduler, with_server, server_port, verbose, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, return_trials, ray_auto_init, sync_function)
227 for i, exp in enumerate(experiments):
228 if not isinstance(exp, Experiment):
--> 229 run_identifier = Experiment.register_if_needed(exp)
230 experiments[i] = Experiment(
231 name=name,
/usr/local/lib/python3.6/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
210 logger.warning(
211 "No name detected on trainable. Using {}.".format(name))
--> 212 register_trainable(name, run_object)
213 return name
214 else:
/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register_trainable(name, trainable)
65 raise TypeError("Second argument must be convertable to Trainable",
66 trainable)
---> 67 _global_registry.register(TRAINABLE_CLASS, name, trainable)
68
69
/usr/local/lib/python3.6/dist-packages/ray/tune/registry.py in register(self, category, key, value)
104 raise TuneError("Unknown category {} not among {}".format(
105 category, KNOWN_CATEGORIES))
--> 106 self._to_flush[(category, key)] = pickle.dumps(value)
107 if _internal_kv_initialized():
108 self.flush_values()
/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
66 with io.BytesIO() as file:
67 cp = CloudPickler(file, protocol=protocol, buffer_callback=buffer_callback)
---> 68 cp.dump(obj)
69 return file.getvalue()
70
/usr/local/lib/python3.6/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dump(self, obj)
555 def dump(self, obj):
556 try:
--> 557 return Pickler.dump(self, obj)
558 except RuntimeError as e:
559 if "recursion" in e.args[0]:
TypeError: 'generator' object is not callable
```
Sorry for the slow reply - I actually thought I posted a reply already :)
This is a scoping issue.
Torchtext fields and datasets need to be created within the Trainable because the Trainable needs to be pickled and replicated across multiple parallel processes.
If they are not, Ray/cloudpickle will try to serialize them and fail.
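To see the failure in isolation (a minimal, generic sketch with stand-in objects, not code from your notebook): whatever the trainable reaches from the surrounding scope gets pickled along with it, and generators, which is the kind of object the TypeError above points at, cannot be pickled at all.

```python
# Minimal illustration of the scoping rule; a plain generator stands in for the
# torchtext field/dataset that cannot be serialized.
from ray import cloudpickle

captured = (i for i in range(10))      # lives in the outer (notebook) scope

def bad_trainable(config):
    return sum(captured)               # reaches out to the module-level generator

def good_trainable(config):
    local = (i for i in range(10))     # created inside the trainable
    return sum(local)

cloudpickle.dumps(good_trainable)      # fine: nothing unpicklable is captured
try:
    cloudpickle.dumps(bad_trainable)   # raises a TypeError: generators are not picklable
except TypeError as e:
    print("pickling failed:", e)
```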
For example, you'll want to do something like:
```python
from ray import tune
from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test
from nltk.util import ngrams
from collections import defaultdict, Counter
import numpy as np
import math
import tqdm
import random
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Parameter
import torchtext
# XXX Note that we duplicate this data loading here because we need the vocab size to create the model
# download and load the data
text_field = torchtext.data.Field()
datasets = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
train_dataset, validation_dataset, test_dataset = datasets
text_field.build_vocab(train_dataset)
vocab = text_field.vocab
vocab_size = len(vocab)
train_text = train_dataset.examples[0].text # a list of tokens (strings)
validation_text = validation_dataset.examples[0].text
import torch.optim as optim
class LSTMModel():
    # ... other code

    def train_on_dataset(self, dataset):
        train_iterator = torchtext.data.BPTTIterator(dataset, batch_size=64,
                                                     bptt_len=32, device='cuda')
        ## THE REST OF THE CODE IS THE SAME AS YOUR train FUNCTION
        h = Variable(torch.zeros(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        c = Variable(torch.zeros(3, 64, self.network.lstm.hidden_size), requires_grad=False).cuda()
        state = (h, c)
        optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr)
        n_epochs = 20
        validation_score_best = 300
        PATH = '/content/model.pth'
        for epoch in range(n_epochs):
            print('Epoch', epoch)
            for batch in tqdm.tqdm_notebook(train_iterator, leave=False):
                assert self.network.training, 'make sure your network is in train mode with `.train()`'
                # call zero_grad on your optimizer
                optimizer.zero_grad()
                context, word = batch.text, batch.target
                # run your network
                logits, state = self.network(context, state)
                logits = logits.view(-1, logits.shape[-1])
                word = word.view(-1,)
                # compute a loss
                loss = F.cross_entropy(logits, word)
                # call `.backward()` and `.step()` on your optimizer
                loss.backward()
                optimizer.step()
            validation_score = self.dataset_perplexity(validation_dataset)
            print('Validation score:', validation_score)
            if validation_score <= validation_score_best:
                validation_score_best = validation_score
                torch.save(self.network.state_dict(), PATH)
        self.network.load_state_dict(torch.load(PATH))


def train_net(config):
    text_field = torchtext.data.Field()
    datasets = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
    train_dataset, validation_dataset, test_dataset = datasets
    validation_text = validation_dataset.examples[0].text

    lstm_model = LSTMModel(config['lr'], config['clip'])
    lstm_model.train_on_dataset(train_dataset)
    perp = lstm_model.perplexity(validation_text)
    print('lstm validation perplexity:', perp)
    tune.track.log(perplexity=perp)


analysis = tune.run(
    train_net, config={"lr": tune.grid_search([0.001, 0.01]), 'clip': tune.grid_search([0.001, 0.01])})

print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))
# Get a dataframe for analyzing trial results.
df = analysis.dataframe()
```
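If you want to sanity-check the refactor before launching the grid search (just a suggestion, not something Tune requires), you can pickle the trainable by hand with Ray's bundled cloudpickle; that is the same step `register_trainable` performs in the traceback above, so it surfaces any leftover capture immediately:

```python
# Hand-rolled check: registration pickles the trainable with ray.cloudpickle,
# so pickling it ourselves shows whether it still drags torchtext objects along.
from ray import cloudpickle

try:
    cloudpickle.dumps(train_net)
    print("train_net pickles cleanly; safe to hand to tune.run")
except TypeError as e:
    print("train_net still captures unpicklable state:", e)
```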
Thanks @richardliaw, and I will close #7202.