Hi, I get this problem every time I train a translation model on my own language pair. I have searched the existing issues for "Model diverged with loss = NaN", but I did not find a solution, so I am posting an issue here.
Here is my code. Thank you!
@registry.register_hparams('transformer_jack')
def transformer_jack():
    """Return the single-GPU base Transformer hparams with custom overrides.

    Starts from ``transformer_base_single_gpu()`` and then overrides the
    warmup steps, learning rate, Adam beta2 and initializer, in that order.
    """
    # (hparam name, new value) pairs, applied on top of the base set.
    overrides = (
        ("learning_rate_warmup_steps", 4000),
        ("learning_rate", 0.01),
        ("optimizer_adam_beta2", 0.98),
        ("initializer", "normal_unit_scaling"),
    )
    base = transformer_base_single_gpu()
    for hparam_name, hparam_value in overrides:
        setattr(base, hparam_name, hparam_value)
    return base
translate_mnzh.py:
# Alias for TensorFlow's global flags object (not referenced in the snippet
# shown here).
FLAGS = tf.flags.FLAGS
# Id taken from text_encoder.EOS_ID; handed to the token generator as the
# end-of-sequence marker.
EOS = text_encoder.EOS_ID
@registry.register_problem
class TranslateMnzhBpe32k(translate.TranslateProblem):
    """Translation problem for the mn->zh task with per-side token vocabularies.

    The source and target vocabularies are separate files located in
    ``data_dir``. Both encoders are built with ``replace_oov="UNK"``.

    NOTE(review): presumably "UNK" must itself be an entry of each vocab file
    for the OOV replacement to resolve to a valid id -- confirm against
    ``text_encoder.TokenTextEncoder``.
    """

    @property
    def targeted_vocab_size(self):
        """Target vocabulary size reported for this problem."""
        return 32000

    @property
    def source_vocab_name(self):
        """File name (inside ``data_dir``) of the source-side vocabulary."""
        return "vocab.32k.mn.txt"

    @property
    def target_vocab_name(self):
        """File name (inside ``data_dir``) of the target-side vocabulary."""
        return "vocab.32k.ch.txt"

    def _vocab_encoder(self, data_dir, vocab_name):
        """Build the token encoder for the vocab file ``vocab_name`` in ``data_dir``."""
        vocab_path = os.path.join(data_dir, vocab_name)
        return text_encoder.TokenTextEncoder(vocab_path, replace_oov="UNK")

    def feature_encoders(self, data_dir):
        """Return the encoders for the "inputs" and "targets" features.

        Args:
            data_dir: directory containing both vocabulary files.

        Returns:
            A dict with keys "inputs" (source encoder) and "targets"
            (target encoder).
        """
        return {
            "inputs": self._vocab_encoder(data_dir, self.source_vocab_name),
            "targets": self._vocab_encoder(data_dir, self.target_vocab_name),
        }

    def generator(self, data_dir, tmp_dir, train):
        """Token generator for the mn->zh task (training or validation split).

        Args:
            data_dir: directory containing the vocabulary files and the
                ``train.32k.*`` / ``valid.32k.*`` parallel text files.
            tmp_dir: unused here; kept for interface compatibility.
            train: if truthy read the ``train.32k`` files, otherwise the
                ``valid.32k`` files.

        Returns:
            Whatever ``translate.token_generator_by_source_target`` returns
            for the ``.mn`` / ``.ch`` file pair, the two vocab encoders and
            ``EOS``.
        """
        # The same prefix is shared by both sides; only the extension differs.
        split_prefix = "train.32k" if train else "valid.32k"
        split_path = os.path.join(data_dir, split_prefix)
        source_token_vocab = self._vocab_encoder(data_dir, self.source_vocab_name)
        target_token_vocab = self._vocab_encoder(data_dir, self.target_vocab_name)
        return translate.token_generator_by_source_target(
            split_path + ".mn", split_path + ".ch",
            source_token_vocab, target_token_vocab, EOS)

    @property
    def input_space_id(self):
        """Space id of the source side."""
        return problem.SpaceID.MN_BPE_TOK

    @property
    def target_space_id(self):
        """Space id of the target side."""
        return problem.SpaceID.ZH_BPE_TOK
error output:
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "/home/jack/tools/ana2/envs/tfgpu1.4-3.5/bin/t2t-trainer", line 4, in <module>
__import__('pkg_resources').run_script('tensor2tensor==1.4.3', 't2t-trainer')
File "/home/jack/.local/lib/python3.5/site-packages/pkg_resources/__init__.py", line 750, in run_script
self.require(requires)[0].run_script(script_name, ns)
File "/home/jack/.local/lib/python3.5/site-packages/pkg_resources/__init__.py", line 1527, in run_script
exec(code, namespace, namespace)
File "/home/jack/tools/ana2/envs/tfgpu1.4-3.5/lib/python3.5/site-packages/tensor2tensor-1.4.3-py3.5.egg/EGG-INFO/scripts/t2t-trainer", line 16, in <module>
tf.app.run()
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/platform/app.py", line 124, in run
_sys.exit(main(argv))
File "/home/jack/tools/ana2/envs/tfgpu1.4-3.5/lib/python3.5/site-packages/tensor2tensor-1.4.3-py3.5.egg/EGG-INFO/scripts/t2t-trainer", line 12, in main
t2t_trainer.main(argv)
File "/home/jack/tools/ana2/envs/tfgpu1.4-3.5/lib/python3.5/site-packages/tensor2tensor-1.4.3-py3.5.egg/tensor2tensor/bin/t2t_trainer.py", line 271, in main
execute_schedule(exp)
File "/home/jack/tools/ana2/envs/tfgpu1.4-3.5/lib/python3.5/site-packages/tensor2tensor-1.4.3-py3.5.egg/tensor2tensor/bin/t2t_trainer.py", line 229, in execute_schedule
getattr(exp, FLAGS.schedule)()
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/contrib/framework/python/framework/experimental.py", line 64, in new_func
return func(*args, **kwargs)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 770, in continuous_train_and_eval
saving_listeners=self._saving_listeners)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 868, in _call_train
saving_listeners=saving_listeners)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 314, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 815, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py", line 539, in run
run_metadata=run_metadata)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py", line 1013, in run
run_metadata=run_metadata)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py", line 1104, in run
raise six.reraise(*original_exc_info)
File "/home/jack/.local/lib/python3.5/site-packages/six.py", line 693, in reraise
raise value
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py", line 1089, in run
return self._sess.run(*args, **kwargs)
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py", line 1169, in run
run_metadata=run_metadata))
File "/home/jack/.local/lib/python3.5/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 605, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
When training on GPU, the error "Model diverged with loss = NaN" is often caused by a softmax that is getting a symbol larger than vocab_size. Please train on CPU for a few steps; the error message there is much better. I suspect that your data contains symbols larger than the vocab_size the model is given (so larger than the softmax you create).
Thank you @lukaszkaiser. I built the vocabulary from the training set on my own. What is the "symbol" in relation to vocab_size? Is it necessary to configure it in my code? Do we have to generate the vocabulary on our own? Is it possible for t2t to generate the vocabulary for us?
@lukaszkaiser, I get the same result even when I make sure the OOV index is never larger than vocab_size. Do you have any other findings that could explain the problem?
Thanks @lukaszkaiser. I got the same error when pre-training BERT. It turned out that the size of vocab.txt was larger than the 'vocab_size' parameter passed to the model.
Most helpful comment
When training on GPU, the error "Model diverged with loss = NaN" is often caused by a softmax that is getting a symbol larger than vocab_size. Please train on CPU for a few steps; the error message there is much better. I suspect that your data contains symbols larger than the vocab_size the model is given (so larger than the softmax you create).