I am new to Ray and Tune. I want to use PBT with a dynamic config. For example:
def explore(config):
    # here config['varA'] is None, not the value set in TrainMNIST._setup
    return config

class TrainMNIST(Trainable):
    def _setup(self, config):
        config['varA'] = something
        self.config.update(config)

pbt = PopulationBasedTraining(
    ...,
    custom_explore_fn=explore)

tune.run(
    TrainMNIST,
    name="exp",
    scheduler=pbt,
    stop={
        "test_acc": 0.99,
        "training_iteration": 100,
    },
    resources_per_trial={
        "cpu": 2,
        "gpu": 0.25,
    },
    **{"config": {
        "args": vars(args),
        "varA": None,
    }})
How can I explore with config['varA'] = something?
Yes, you can do anything to the config in the explore function. I don't think changing it in setup makes sense though.
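For example, here is a minimal sketch of an explore function that perturbs a value (the key "lr" and the random perturbation are just for illustration):

import random

def explore(config):
    # mutate whatever keys you like; the returned dict becomes the exploited trial's new config
    config["lr"] = config.get("lr", 0.1) * random.choice([0.8, 1.2])
    return config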
Sorry for being unclear. I meant that config['varA'] = None when it is passed into tune.run. Then I change it to config['varA'] = 1 in TrainMNIST._setup, and I want to explore with config['varA'] = 1. However, in explore I still get config['varA'] = None.
import argparse
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from torchvision import datasets, transforms

import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import PopulationBasedTraining

MNIST_ROOT = "path/to/mnist"
LOCAL_DIR = "path/to/results"
perturbation_interval = 1
def explore(config):
    """Custom explore function.

    Args:
        config: dictionary containing ray config params.

    Returns:
        Copy of config with modified augmentation policy.
    """
    print(config)
    return config
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
        self.fc = nn.Linear(192, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 3))
        x = x.view(-1, 192)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)
class TrainMNIST(Trainable):
    def _setup(self, config):
        args = config.pop("args")
        args.update(config)
        self.is_cuda = torch.cuda.is_available()
        torch.manual_seed(args['seed'])
        if self.is_cuda:
            torch.cuda.manual_seed(args['seed'])
        self.kwargs = {"num_workers": 4, "pin_memory": True} if self.is_cuda else {}
        mnist_transforms = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize((0.1307,), (0.3081,))])
        self.trainset = datasets.MNIST(MNIST_ROOT, train=True, download=False, transform=mnist_transforms)
        self.testset = datasets.MNIST(MNIST_ROOT, train=False, download=False, transform=mnist_transforms)
        self.train_loader = torch.utils.data.DataLoader(self.trainset, batch_size=256, shuffle=True, **self.kwargs)
        self.test_loader = torch.utils.data.DataLoader(self.testset, batch_size=256, shuffle=True, **self.kwargs)
        self.model = Net()
        if self.is_cuda:
            self.model.cuda()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
        self.scheduler = MultiStepLR(self.optimizer, milestones=[30, 60, 90], gamma=0.2)
        self.args = args
        # dynamically fill in the missing value; explore() never sees this change
        if config['a'] is None:
            config['a'] = 1111111
        self.config.update(config)
    def _train_iteration(self):
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            if self.is_cuda:
                data, target = data.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            self.optimizer.step()

    def _test(self):
        self.model.eval()
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                if self.is_cuda:
                    data, target = data.cuda(), target.cuda()
                output = self.model(data)
                # get the index of the max log-probability
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(
                    target.data.view_as(pred)).long().cpu().sum()
        accuracy = correct.item() / len(self.test_loader.dataset)
        return {"test_acc": accuracy}

    def _train(self):
        self._train_iteration()
        self.scheduler.step(epoch=self._iteration)
        return self._test()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    def reset_config(self, new_config):
        self.config.update(new_config)
        return True
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--ray-redis-address",
        help="Address of Ray cluster for seamless distributed execution.")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    args = parser.parse_args()

    if args.ray_redis_address:
        ray.init(redis_address=args.ray_redis_address)

    pbt = PopulationBasedTraining(
        time_attr="training_iteration",
        reward_attr="test_acc",
        perturbation_interval=perturbation_interval,
        custom_explore_fn=explore,
        log_config=True)

    tune.run(
        TrainMNIST,
        name="exp",
        scheduler=pbt,
        stop={
            "test_acc": 0.95,
            "training_iteration": 100,
        },
        resources_per_trial={
            "cpu": 2,
            "gpu": 0.25,
        },
        reuse_actors=True,
        num_samples=2,
        checkpoint_freq=0,
        local_dir=LOCAL_DIR,
        **{"config": {
            "args": vars(args),
            "a": None,
        }})
Hm, I think this is to be expected given how PBT internally tracks the config. The copy of the config it gives your trial is not connected to the original since the trial may be running on a different machine. Why not set a: 1 in the experiment config instead?
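For reference, a minimal sketch of that suggestion, assuming the value is known when the experiment is launched: put a concrete value for "a" in the config passed to tune.run and let PBT itself mutate it via hyperparam_mutations (the candidate values below are made up):

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    reward_attr="test_acc",
    perturbation_interval=perturbation_interval,
    # PBT resamples/perturbs "a" itself, so explore() always sees a concrete value
    hyperparam_mutations={"a": [1, 10, 100]},
    custom_explore_fn=explore,
    log_config=True)

tune.run(
    TrainMNIST,
    name="exp",
    scheduler=pbt,
    config={
        "args": vars(args),
        "a": 1,  # start from the real value instead of None
    })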
I found a way to work around this by adding trial.config.update(result['config']) in on_trial_result. I use this to make the explore function adapt to each model's performance; say, for better-performing models I use a smaller step for the hyperparameters.
Still, I think synchronizing the config is essential for robustness.
def on_trial_result(self, trial_runner, trial, result):
    # sync the driver-side trial config with the config reported in the result
    trial.config.update(result['config'])
    ...
This updates the trial's config whenever a result comes in.
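If you would rather not patch the scheduler in place, the same workaround can be written as a subclass (SyncedPBT is just a hypothetical name for this sketch):

class SyncedPBT(PopulationBasedTraining):
    def on_trial_result(self, trial_runner, trial, result):
        # copy the Trainable's reported config back to the driver-side trial,
        # so that explore() later sees the values set in _setup
        trial.config.update(result["config"])
        return super(SyncedPBT, self).on_trial_result(trial_runner, trial, result)

You would then pass scheduler=SyncedPBT(...) to tune.run in place of the stock PBT scheduler.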