Steps to reproduce the behavior:

```python
model = TestModel()
trainer = pl.Trainer(gpus=1, default_save_path=exp_path, max_epochs=100)

# The model's configure_optimizers:
def configure_optimizers(self):
    optim = torch.optim.Adam(self.parameters(), lr=self.lr)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, 'min')
    return [optim], [sched]

# Run learning rate finder
lr_finder = trainer.lr_find(model)

# Results can be found in
lr_finder.results

# Plot with
fig = lr_finder.plot(suggest=True)
fig.show()
```

The following error is raised consistently:

```
optimizer got an empty parameter list
```

The regular `.fit` method works as expected.

PL version: `0.7.6`
Hi! Thanks for your contribution, great first issue!

@SkafteNicki mind having a look? :rabbit:
I am not sure this has anything to do with the learning rate finder. The error

```
optimizer got an empty parameter list
```

usually indicates that `self.parameters()` is empty; see this link for an example. Could you provide the full model (or a Colab notebook) so I can look through it?
It's a bit hard to prepare the Colab, so here's an end-to-end model that's exactly what I have, just using a toy dataset.

As an aside question: is it possible to chain schedulers and have one take precedence? For instance, have a cosine annealing scheduler by default but still apply LR reduction on plateau. Or should I just call `fit` multiple times and change the `lr` parameter (or any parameter, really) by hand in between?
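For reference, one way to combine two schedulers on the same optimizer in plain PyTorch looks roughly like this (a minimal sketch, not from the thread; `train_one_epoch` is a hypothetical helper, and in recent PyTorch versions the cosine update is applied relative to the current learning rate, so a plateau cut carries forward):

```python
import torch
import torch.nn as nn

model = nn.Linear(3, 1)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=50)
plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, 'min', patience=5)

for epoch in range(100):
    val_loss = train_one_epoch(model, optim)  # hypothetical train/validate helper
    cosine.step()           # anneal every epoch by default
    plateau.step(val_loss)  # additionally cut the LR when val_loss plateaus
```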
```python
import logging
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision import transforms

L = logging.getLogger(__name__)


class Normalize(nn.Module):
    def __init__(self, mean, std):
        super(Normalize, self).__init__()
        self.mean = mean
        self.std = std
        assert self.std.min() > 0

    def forward(self, x):
        return (x - self.mean) / self.std


class Swish(nn.Module):
    def forward(self, input_tensor):
        return input_tensor * torch.sigmoid(input_tensor)


class Mish(nn.Module):
    def forward(self, input_tensor):
        return input_tensor * torch.tanh(F.softplus(input_tensor))


def sigmoid_range(x, low, high):
    "Sigmoid function with range `(low, high)`"
    return torch.sigmoid(x) * (high - low) + low


def median_absolute_percentage_error(y_pred, y):
    e = torch.abs(y.view_as(y_pred) - y_pred) / torch.abs(y.view_as(y_pred))
    return 100.0 * torch.median(e).item()


def r2_score(y_pred, y):
    "R2 score (coefficient of determination) between `pred` and `targ`."
    u = torch.sum((y - y_pred) ** 2)
    d = torch.sum((y - y.mean()) ** 2)
    return 1 - u / d


class TestModel(pl.LightningModule):
    def __init__(self, val_pct: float = 0.1, lr: float = 0.001,
                 seed: int = 31, batch_size: int = 2048, scaled=True,
                 enable_batch_norm=False):
        super().__init__()
        self.lr = lr
        self.val_pct: float = val_pct
        self.seed: int = seed
        self.batch_size: int = batch_size
        self.data_raw: pd.DataFrame = None
        self.X_train_raw: torch.Tensor = None
        self.X_train: torch.Tensor = None
        self.X_valid_raw: torch.Tensor = None
        self.X_valid: torch.Tensor = None
        self.Y_train: torch.Tensor = None
        self.Y_valid: torch.Tensor = None
        self.X_names: List[str] = []
        self.Y_names: List[str] = []
        self.train_dataset: Dataset = None
        self.valid_dataset: Dataset = None
        self.train_idx = []
        self.valid_idx = []
        self.tfms: transforms.Compose = None
        self.scaled = scaled
        self.enable_bn = enable_batch_norm
        self.batch_norm = None
        self.mean_target = None  # set in prepare_data when scaled=True

    def _prepare_model(self, feat_n):
        if self.enable_bn:
            self.batch_norm = nn.BatchNorm1d(feat_n)
            L.info('Using batchnorm')
        # act = nn.Tanh()
        # act = Mish()
        act = nn.ReLU()
        self.layer1 = nn.Linear(feat_n, 1000)
        self.layer2 = nn.Linear(1000, 500)
        self.layer3 = nn.Linear(500, 250)
        self.layer4 = nn.Linear(250, 1)
        # Set bias to mean of target
        if self.mean_target is not None:
            self.layer4.bias.data.fill_(self.mean_target)
        self.layers = nn.Sequential(
            self.layer1,
            act,
            self.layer2,
            act,
            self.layer3,
            act,
            self.layer4,
        )

    def prepare_data(self):
        # Feature engineer and convert to tensors (toy data stands in for the real dataset)
        X = torch.rand(100000, 3)
        Y = torch.rand(100000, 1)
        self.X_names, self.Y_names = ['x1', 'x2', 'x3'], ['y1']
        # Train/valid split
        self.train_idx, self.valid_idx = train_test_split(
            np.arange(len(Y)), test_size=self.val_pct,
            random_state=self.seed, shuffle=True)
        L.info(f'Using {len(self.train_idx)} train samples and {len(self.valid_idx)} valid samples')
        L.info(f'Features: \n{self.X_names}, \n Targets: \n{self.Y_names}')
        self.X_train = X[self.train_idx]
        self.Y_train = Y[self.train_idx]
        self.X_valid = X[self.valid_idx]
        self.Y_valid = Y[self.valid_idx]
        L.info(f'X_train shape: {self.X_train.shape}, Y_train shape: {self.Y_train.shape}')
        L.info(f'X_train means: {self.X_train.mean(dim=0)}, \n X_train stds: {self.X_train.std(dim=0)}')
        L.info(f'X_valid means: {self.X_valid.mean(dim=0)}, \n X_valid stds: {self.X_valid.std(dim=0)}')
        # Normalize and transform
        self.tfms = Normalize(self.X_train.mean(dim=0), self.X_train.std(dim=0))
        L.info(f'Transforms: {self.tfms}')
        self.X_train = self.tfms(self.X_train)
        self.X_valid = self.tfms(self.X_valid)
        L.info(f'X_train means post tfms: {self.X_train.mean(dim=0)},\n X_valid means: {self.X_valid.mean(dim=0)}')
        # Construct datasets
        self.train_dataset = TensorDataset(self.X_train, self.Y_train)
        self.valid_dataset = TensorDataset(self.X_valid, self.Y_valid)
        # Initialize model with scaling properties; mean target becomes the bias of the last linear layer
        if self.scaled:
            self.y_min = self.Y_train.min() * 1.01
            self.y_max = self.Y_train.max() * 1.01
            self.mean_target = self.Y_train.mean().item()
            assert not torch.isnan(self.y_min)
            assert not torch.isnan(self.y_max)
            L.info(f'Using scaling of outputs: y_min: {self.y_min}, y_max: {self.y_max}. y_pred_bias: {self.mean_target}')
        self._prepare_model(len(self.X_names))

    def forward(self, x):
        if self.batch_norm is not None:
            x = self.batch_norm(x)
        if self.scaled:
            return sigmoid_range(self.layers(x), self.y_min, self.y_max)
        else:
            return self.layers(x)

    def step(self, y_hat, y, mode='train'):
        loss = F.mse_loss(y_hat, y) * 10e5  # scale up the loss (10e5 == 1e6)
        mae = F.l1_loss(y_hat, y)
        mape = median_absolute_percentage_error(y_hat, y)
        r2 = r2_score(y_hat, y)
        out = {'loss': loss, 'mae': mae, 'mape': mape, 'R2': r2}
        if mode == 'train':
            out['log'] = out.copy()
            return out
        elif mode == 'val':
            out = {f'{mode}_{k}': v for k, v in out.items()}
            out['log'] = out.copy()
            return out
        else:
            raise Exception('Unsupported mode')

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        return self.step(y_hat, y, 'train')

    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), lr=self.lr)
        sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, 'min')
        return [optim], [sched]

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size,
                          num_workers=4, shuffle=False)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        return self.step(y_hat, y, 'val')

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'val_loss': avg_loss, 'log': tensorboard_logs}

    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size,
                          num_workers=4, shuffle=False)


model = TestModel()
trainer = pl.Trainer(gpus=1, default_save_path=exp_path, max_epochs=100)

# Run learning rate finder
lr_finder = trainer.lr_find(model)

# Results can be found in
lr_finder.results

# Plot with
fig = lr_finder.plot(suggest=True)
fig.show()
```
```
     44 param_groups = list(params)
     45 if len(param_groups) == 0:
---> 46     raise ValueError("optimizer got an empty parameter list")
     47 if not isinstance(param_groups[0], dict):
     48     param_groups = [{'params': param_groups}]

ValueError: optimizer got an empty parameter list
```
I have a feeling that maybe `lr_find` isn't calling `prepare_data` (and hence `_prepare_model`), so all the `nn` layers are `None`?
`lr_find` does call `prepare_data`; however, it implicitly expects that the model parameters are initialized beforehand. The intention behind `prepare_data` is only to set up/download whatever data the model is going to use, not to set up the model parameters.

Your setup is very atypical; normally model parameters are always initialized in the `__init__(...)` method. I would suggest that you move the call to `self._prepare_model(feat_n)` to the bottom of `__init__(...)`.
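A minimal sketch of that suggestion (only the relevant part of `__init__`; the `feat_n` default is illustrative):

```python
import pytorch_lightning as pl


class TestModel(pl.LightningModule):
    def __init__(self, feat_n: int = 3, lr: float = 0.001):
        super().__init__()
        self.lr = lr
        self.mean_target = None      # data-dependent tweaks can still be applied later
        # Build the layers up front so self.parameters() is non-empty
        # before lr_find or fit inspects the model.
        self._prepare_model(feat_n)
```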
@SkafteNicki What if the model is data dependent and we want to keep that dependence integrated (instead of passing data-dependent parameters to the constructor)? In my case this is the bias term of the last layer, which is set to the mean of the training set target. This is only available after the training set is loaded and determined. In these situations, do you usually advise moving the training data loader outside of the model and using the production-recommended approach?
I would definitely go for the production approach in your case. My take on your use case is that you want to set your bias term to a constant. Even though you choose that constant in a very specific way (as the mean of the training set target), it boils down to being a hyperparameter of the model and thus should be a value given to the `__init__(...)` method.

All that said, I am willing to fix this in a PR; I am just trying to figure out whether this is a unique case or a broader observation.
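A sketch of what that could look like (illustrative; `load_training_targets` is a hypothetical helper, and it assumes `__init__` gains a `mean_target` argument):

```python
# Compute the data-dependent constant outside the module and pass it in
# as an ordinary hyperparameter.
Y_train = load_training_targets()  # hypothetical: returns the training targets
model = TestModel(mean_target=Y_train.mean().item())
```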
Well, I guess it's a question of when model specification happens: before you handle your data or after. I think there are plenty of use cases in which model parameters are set as a function of the data (min, max, biases, etc.), so it's fine to say explicitly that this is not supported with the `prepare_data` mechanism; but the `lr_find` and `fit` methods should still have consistent behavior (and if there's no good reason to require the bias to be initialized, it should probably be allowed). That's my personal feedback; what's your take on it? I'm happy to use the alternative approach and process all data outside the PL module, but again, it seems to break self-containment and isolation just to make the lr finder happy. Alternatively, I can set up the model in `__init__` so the parameters won't be empty and then add the tweaks to the layers later in `prepare_data`; my only issue with that is that the model is then defined in a bunch of places, which is somewhat error prone.
Normally I would say that model specification and handling of data should be independent of each other. However, I do see your point that you may want to adjust the model according to the data.

I agree that the behavior of the `fit` and `lr_find` methods should be the same. I will include this change in PR #1998, which is a major rework of the learning rate finder.
Thanks for feedback 👍
Hi, I'm running into the same kind of problem here. What would the production approach be?
@Molaire Don't use `prepare_data`; instead, call `fit` and `lr_find` with a dataloader parameter that you've processed and initialized outside of the model (I actually like to put a static method on the model for it, just to keep things tidy).
> @Molaire Don't use `prepare_data`; instead, call `fit` and `lr_find` with a dataloader parameter that you've processed and initialized outside of the model (I actually like to put a static method on the model for it, just to keep things tidy).

Do you have some code example?
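For reference, a minimal sketch of that approach (names and shapes are illustrative, and it assumes the `train_dataloader` arguments that `fit` and `lr_find` accept in this PL version):

```python
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset


def make_dataloader(batch_size: int = 2048) -> DataLoader:
    # All data processing happens outside the LightningModule.
    X = torch.rand(10000, 3)
    Y = torch.rand(10000, 1)
    return DataLoader(TensorDataset(X, Y), batch_size=batch_size)


train_loader = make_dataloader()
model = TestModel()  # assumes a module whose layers are built in __init__
trainer = pl.Trainer(max_epochs=100)

# Both lr_find and fit receive the externally built dataloader.
lr_finder = trainer.lr_find(model, train_dataloader=train_loader)
trainer.fit(model, train_dataloader=train_loader)
```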
> Normally I would say that model specification and handling of data should be independent of each other. However, I do see your point that you may want to adjust the model according to the data.

+1 for a way to handle this use case.

Mine is a generic supervised classification model for which the number of classes is not known before loading the data. I ended up calling `prepare_data` in `__init__` and defining the output dimension of the final `nn.Linear` layer according to the outcome of `prepare_data`, roughly as sketched below.
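A minimal sketch of that workaround (sizes and the stand-in labels are illustrative):

```python
import torch
import torch.nn as nn
import pytorch_lightning as pl


class Classifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # Load the data eagerly so data-dependent sizes are known up front.
        self.prepare_data()  # sets self.num_classes from the labels
        self.backbone = nn.Sequential(nn.Linear(128, 64), nn.ReLU())
        self.head = nn.Linear(64, self.num_classes)

    def prepare_data(self):
        labels = torch.randint(0, 10, (1000,))  # stand-in for the real labels
        self.num_classes = int(labels.max()) + 1
```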
@feribg Can we close this issue? Mind checking again on master?
Yep, thanks!