I'm having issues using DataParallel with the Reformer model on 4 GPUs. I am trying to feed ReformerModel input embeddings and get the last hidden state out. I am using apex amp, but I get the same error without amp. I also get the same error when I use input IDs rather than embeddings, and I've tested the same script with other HuggingFace models (BERT and RoBERTa) with no issues.
Simple code:
import torch
from apex import amp
import transformers
from transformers import ReformerModel
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
print(transformers.__version__)
print(torch.__version__)
device = torch.device("cuda:0")
batch_size = 4
model_rf = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
model_rf.to(device)
opt_rf = torch.optim.AdamW(model_rf.parameters(), lr=0.0002)
model_rf, opt_rf = amp.initialize(model_rf, opt_rf)
model_rf = nn.DataParallel(model_rf)
embeds = torch.randn(80, 64, 256)
training_set = TensorDataset(embeds, embeds)
training_generator = DataLoader(training_set, batch_size=batch_size, shuffle=True)
for i, batch in enumerate(training_generator):
    embeds, _ = batch
    h_final = model_rf(inputs_embeds=embeds.to(device))
And the error:
Traceback (most recent call last):
File "rf_4.py", line 35, in <module>
h_final = model_rf(inputs_embeds=embeds)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/usr/local/lib/python3.6/dist-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/modeling_reformer.py", line 1621, in forward
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/modeling_reformer.py", line 234, in forward
position_embeddings = self.position_embeddings(position_ids)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/modeling_reformer.py", line 170, in forward
[weight[:, :required_pos_encodings_columns] for weight in broadcasted_weights], dim=-1
File "/usr/local/lib/python3.6/dist-packages/apex/amp/wrap.py", line 81, in wrapper
return orig_fn(seq, *args, **kwargs)
RuntimeError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat. This usually means that this function requires a non-empty list of Tensors. Available functions are [CPUTensorId, CUDATensorId, QuantizedCPUTensorId, VariableTensorId]
This code throws the error at the h_final line.
transformers version: 3.0.2
Update: this seems relevant: https://github.com/pytorch/pytorch/issues/36035
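The error says torch.cat received an empty list of tensors, and the failing line is iterating the nn.ParameterList inside Reformer's AxialPositionEmbeddings, so one plausible reading (consistent with the PyTorch issue linked above) is that the ParameterList comes up empty in the DataParallel replicas on torch 1.5. Below is a minimal sketch of that pattern outside transformers; CatParams is a hypothetical module, the script assumes at least two GPUs, and on torch 1.4 it should simply print the output shape:
import torch
import torch.nn as nn

class CatParams(nn.Module):
    """Hypothetical module mirroring AxialPositionEmbeddings: it iterates an
    nn.ParameterList in forward and concatenates the pieces."""
    def __init__(self):
        super().__init__()
        self.weights = nn.ParameterList(
            [nn.Parameter(torch.randn(4, 8)), nn.Parameter(torch.randn(4, 8))]
        )

    def forward(self, x):
        # If a replica sees an empty ParameterList, this becomes torch.cat([])
        # and raises the same aten::_cat RuntimeError as in the traceback above.
        return x + torch.cat(list(self.weights), dim=-1)

model = nn.DataParallel(CatParams().to("cuda:0"))
out = model(torch.zeros(2, 4, 16, device="cuda:0"))  # scattered across GPUs along dim 0
print(out.shape)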
I face the same error when using multiple GPUs with the Reformer model:
Traceback (most recent call last):
File "src/run_language_modeling.py", line 305, in <module>
main()
File "src/run_language_modeling.py", line 269, in main
trainer.train(model_path=model_path)
File "/project/6006286/qiwu/from_git/transformers/src/transformers/trainer.py", line 499, in train
tr_loss += self._training_step(model, inputs, optimizer)
File "/project/6006286/qiwu/from_git/transformers/src/transformers/trainer.py", line 632, in _training_step
outputs = model(**inputs)
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/project/6006286/qiwu/from_git/transformers/src/transformers/modeling_reformer.py", line 1746, in forward
return_tuple=return_tuple,
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/project/6006286/qiwu/from_git/transformers/src/transformers/modeling_reformer.py", line 1610, in forward
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/project/6006286/qiwu/from_git/transformers/src/transformers/modeling_reformer.py", line 236, in forward
position_embeddings = self.position_embeddings(position_ids)
File "/home/qiwu/protein-reformer-env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/project/6006286/qiwu/from_git/transformers/src/transformers/modeling_reformer.py", line 143, in forward
weights = torch.cat(broadcasted_weights, dim=-1)
RuntimeError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat. This usually means that this function requires a non-empty list of Tensors. Available functions are [CPUTensorId, CUDATensorId, QuantizedCPUTensorId, VariableTensorId]
Out of curiosity, do you have the same error on PyTorch 1.4?
I stopped my Google Cloud instance - now there are none available. Maybe someone else can check?
In my case there's no error using torch 1.4.0, but I got a warning:
07/16/2020 11:58:13 - INFO - transformers.trainer - ***** Running training *****
07/16/2020 11:58:13 - INFO - transformers.trainer - Num examples = 5444
07/16/2020 11:58:13 - INFO - transformers.trainer - Num Epochs = 12
07/16/2020 11:58:13 - INFO - transformers.trainer - Instantaneous batch size per device = 32
07/16/2020 11:58:13 - INFO - transformers.trainer - Total train batch size (w. parallel, distributed & accumulation) = 64
07/16/2020 11:58:13 - INFO - transformers.trainer - Gradient Accumulation steps = 1
07/16/2020 11:58:13 - INFO - transformers.trainer - Total optimization steps = 1000
Epoch:   0%|          | 0/12 [00:00<?, ?it/s]
/home/qiwu/torch-1.4/lib/python3.7/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
Found a relevant issue : https://github.com/huggingface/transformers/issues/852
https://discuss.pytorch.org/t/how-to-fix-gathering-dim-0-warning-in-multi-gpu-dataparallel-setting/41733/2
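For what it's worth, that gather warning is usually harmless: with nn.DataParallel each replica returns a scalar loss, DataParallel gathers them into a 1-D tensor, and the caller is expected to average it (Trainer already does this when n_gpu > 1). A minimal sketch of that reduction - the helper name is mine, not part of transformers:
import torch

def reduce_dataparallel_loss(loss: torch.Tensor) -> torch.Tensor:
    # nn.DataParallel gathers the per-replica scalar losses into a 1-D tensor
    # (hence the "unsqueeze and return a vector" warning); average them back
    # to a single scalar before calling backward().
    return loss.mean() if loss.dim() > 0 else loss

# Usage sketch: loss = reduce_dataparallel_loss(model(**inputs)[0]); loss.backward()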
To be honest, I haven't checked Reformer on multi-GPU yet - I'll note this issue down.
@qwu01, what version of transformers are you using, and do you also have tokenizers installed? I get a vague segmentation fault error when I attempt to run about the same training script as above using torch==1.4.0, transformers==2.9.0, and tokenizers==0.7.0.
@jstremme I have these installed:
Package Version
argh 0.26.2
certifi 2020.6.20
chardet 3.0.4
click 7.1.2
configparser 5.0.0
docker-pycreds 0.4.0
filelock 3.0.12
gitdb 4.0.5
GitPython 3.1.7
gql 0.2.0
graphql-core 1.1
idna 2.10
joblib 0.16.0
numpy 1.18.4
nvidia-ml-py3 7.352.0
packaging 20.4
pathtools 0.1.2
pip 19.1.1
promise 2.3
psutil 5.7.0
pyparsing 2.4.7
python-dateutil 2.8.1
PyYAML 5.3.1
regex 2019.11.1
requests 2.24.0
sacremoses 0.0.43
sentencepiece 0.1.90
sentry-sdk 0.16.1
setuptools 41.0.1
shortuuid 1.0.1
six 1.15.0
smmap 3.0.4
subprocess32 3.5.3
tokenizers 0.8.1rc1
torch 1.4.0
tqdm 4.47.0
transformers 3.0.2
urllib3 1.25.9
wandb 0.9.3
watchdog 0.9.0
wheel 0.33.4
Thanks very much @qwu01. Just to confirm, downgrading torch to 1.4.0 allowed you to train Reformer with multiple GPUs? Did this impact anything else?
The environment I'm using does not allow me to install tokenizers 0.8.1rc1 and transformers 3.0.2 currently, but I will test your environment config as soon as I'm able :)
If you are actively working on training a large Reformer model, I would be interested in discussing your parameters. I'm dealing with sequences of max length around 300k SentencePiece tokens and am limited to batch size = 1. Multi-GPU should get me to batch size = 4.
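(For context only - the numbers below are illustrative assumptions, not the poster's actual settings - a Reformer configuration in that sequence-length regime might look like this sketch; the axial position embedding shape has to multiply out to the padded sequence length, and the axial embedding dims have to sum to hidden_size.)
from transformers import ReformerConfig, ReformerModelWithLMHead

# Illustrative sketch: 512 * 1024 = 524288 positions covers ~300k tokens once
# sequences are padded to a multiple of the attention chunk lengths.
config = ReformerConfig(
    vocab_size=320,                  # assumption: small SentencePiece vocabulary
    hidden_size=256,
    axial_pos_shape=(512, 1024),     # product must equal the padded sequence length
    axial_pos_embds_dim=(64, 192),   # must sum to hidden_size
    max_position_embeddings=524288,
    attn_layers=["lsh", "local", "lsh", "local", "lsh", "local"],
)
model = ReformerModelWithLMHead(config)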
@jstremme Yes, I'm sure that torch 1.4.0 with multiple GPUs worked for Reformer training. AFAICT it's not impacting anything else.
@qwu01, @anthonyfuller7, downgrading to torch==1.4.0 worked for me as well :D
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.