I am new to Ray and Tune. I want to use PBT with a dynamic config. For example:
def explore(config):
    # here config['varA'] is None, not the value set in TrainMNIST._setup
    return config

class TrainMNIST(Trainable):
    def _setup(self, config):
        config['varA'] = something
        self.config.update(config)

pbt = PopulationBasedTraining(
    ...,
    custom_explore_fn=explore)

tune.run(
    TrainMNIST,
    name="exp",
    scheduler=pbt,
    stop={
        "test_acc": 0.99,
        "training_iteration": 100,
    },
    resources_per_trial={
        "cpu": 2,
        "gpu": 0.25,
    },
    **{"config": {
        "args": vars(args),
        "varA": None,
    }})
How can I explore with config['varA'] = something?
Yes, you can do anything to the config in the explore function. I don't think changing it in setup makes sense though.
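For example, here is a minimal sketch of an explore function that perturbs a value (the key "lr" and the random perturbation are just for illustration):

import random

def explore(config):
    # mutate whatever keys you like; the returned dict becomes the exploited trial's new config
    config["lr"] = config.get("lr", 0.1) * random.choice([0.8, 1.2])
    return config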
Sorry for being unclear. I meant that config['varA'] = None when it is passed into tune.run. Then I change it to config['varA'] = 1 in TrainMNIST._setup, and I want to explore with config['varA'] = 1. However, in explore I still get config['varA'] = None.
import argparse
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from torchvision import datasets, transforms

import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import PopulationBasedTraining

MNIST_ROOT = "path/to/mnist"
LOCAL_DIR = "path/to/results"
perturbation_interval = 1
def explore(config):
    """Custom explore function.

    Args:
        config: dictionary containing ray config params.

    Returns:
        Copy of config with modified augmentation policy.
    """
    print(config)
    return config
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
        self.fc = nn.Linear(192, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 3))
        x = x.view(-1, 192)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)
class TrainMNIST(Trainable):
    def _setup(self, config):
        args = config.pop("args")
        args.update(config)
        self.is_cuda = torch.cuda.is_available()
        torch.manual_seed(args['seed'])
        if self.is_cuda:
            torch.cuda.manual_seed(args['seed'])
        self.kwargs = {"num_workers": 4, "pin_memory": True} if self.is_cuda else {}
        mnist_transforms = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize((0.1307,), (0.3081,))])
        self.trainset = datasets.MNIST(MNIST_ROOT, train=True, download=False, transform=mnist_transforms)
        self.testset = datasets.MNIST(MNIST_ROOT, train=False, download=False, transform=mnist_transforms)
        self.train_loader = torch.utils.data.DataLoader(self.trainset, batch_size=256, shuffle=True, **self.kwargs)
        self.test_loader = torch.utils.data.DataLoader(self.testset, batch_size=256, shuffle=True, **self.kwargs)
        self.model = Net()
        if self.is_cuda:
            self.model.cuda()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
        self.scheduler = MultiStepLR(self.optimizer, milestones=[30, 60, 90], gamma=0.2)
        self.args = args
        # dynamically fill in the missing value; explore() never sees this change
        if config['a'] is None:
            config['a'] = 1111111
        self.config.update(config)
    def _train_iteration(self):
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            if self.is_cuda:
                data, target = data.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            self.optimizer.step()

    def _test(self):
        self.model.eval()
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                if self.is_cuda:
                    data, target = data.cuda(), target.cuda()
                output = self.model(data)
                # get the index of the max log-probability
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(
                    target.data.view_as(pred)).long().cpu().sum()
        accuracy = correct.item() / len(self.test_loader.dataset)
        return {"test_acc": accuracy}

    def _train(self):
        self._train_iteration()
        self.scheduler.step(epoch=self._iteration)
        return self._test()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    def reset_config(self, new_config):
        self.config.update(new_config)
        return True
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--ray-redis-address",
        help="Address of Ray cluster for seamless distributed execution.")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    args = parser.parse_args()

    if args.ray_redis_address:
        ray.init(redis_address=args.ray_redis_address)

    pbt = PopulationBasedTraining(
        time_attr="training_iteration",
        reward_attr="test_acc",
        perturbation_interval=perturbation_interval,
        custom_explore_fn=explore,
        log_config=True)

    tune.run(
        TrainMNIST,
        name="exp",
        scheduler=pbt,
        stop={
            "test_acc": 0.95,
            "training_iteration": 100,
        },
        resources_per_trial={
            "cpu": 2,
            "gpu": 0.25,
        },
        reuse_actors=True,
        num_samples=2,
        checkpoint_freq=0,
        local_dir=LOCAL_DIR,
        **{"config": {
            "args": vars(args),
            "a": None,
        }})
Hm, I think this is to be expected given how PBT internally tracks the config. The copy of the config it gives your trial is not connected to the original since the trial may be running on a different machine. Why not set a: 1 in the experiment config instead?
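For reference, a minimal sketch of that suggestion, assuming the value is known when the experiment is launched: put a concrete value for "a" in the config passed to tune.run and let PBT itself mutate it via hyperparam_mutations (the candidate values below are made up):

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    reward_attr="test_acc",
    perturbation_interval=perturbation_interval,
    # PBT resamples/perturbs "a" itself, so explore() always sees a concrete value
    hyperparam_mutations={"a": [1, 10, 100]},
    custom_explore_fn=explore,
    log_config=True)

tune.run(
    TrainMNIST,
    name="exp",
    scheduler=pbt,
    config={
        "args": vars(args),
        "a": 1,  # start from the real value instead of None
    })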
I found a way to work around this by adding trial.config.update(result['config']) in on_trial_result. I use this to make the explore function adapt to each model's performance; say, for better-performing models I use a smaller step for the hyperparameters.
Still, I think synchronizing the config is essential for robustness.
def on_trial_result(self, trial_runner, trial, result):
    # sync the driver-side trial config with the config reported in the result
    trial.config.update(result['config'])
    ...
This updates the trial's config whenever a result comes in.
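If you would rather not patch the scheduler in place, the same workaround can be written as a subclass (SyncedPBT is just a hypothetical name for this sketch):

class SyncedPBT(PopulationBasedTraining):
    def on_trial_result(self, trial_runner, trial, result):
        # copy the Trainable's reported config back to the driver-side trial,
        # so that explore() later sees the values set in _setup
        trial.config.update(result["config"])
        return super(SyncedPBT, self).on_trial_result(trial_runner, trial, result)

You would then pass scheduler=SyncedPBT(...) to tune.run in place of the stock PBT scheduler.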