Addons: Tqdm Callback Fails to run with tf.data API

Created on 17 Apr 2020 · 7 Comments · Source: tensorflow/addons

Using:
TensorFlow 2.1.0
TensorFlow Addons 0.9.1

Describe the bug

Tqdm Callback fails to run with tf.data API

Code to reproduce the issue

import tensorflow as tf
print(f"tf.__version__: {tf.__version__}")
import tensorflow_addons as tfa
from tensorflow.keras import backend as K
from tensorflow.keras.datasets import mnist
import tensorflow_datasets as tfds
import os

# Locate the Colab TPU and build a TPUStrategy; everything below needs `tpu`.
try:
    # os.environ[...] raises KeyError when COLAB_TPU_ADDR is unset, which is
    # what happens when the notebook is not attached to a TPU runtime, so it
    # is caught alongside the ValueError the original handler expected.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except (KeyError, ValueError) as err:
    # Fail with an ordinary exception type and keep the underlying cause in
    # the traceback instead of raising a bare BaseException.
    raise RuntimeError('ERROR: Not connected to a TPU runtime.') from err

# Connect to the TPU cluster and initialize it before creating the strategy.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", tpu_strategy.num_replicas_in_sync)

def get_dataset(batch_size=200):
  """Load MNIST through tfds and return batched (train, test) datasets.

  Args:
    batch_size: Number of examples per batch for both splits.

  Returns:
    A `(train_dataset, test_dataset)` tuple. Both splits are mapped through
    the pixel-scaling function and batched; only the training split is
    shuffled (buffer of 10000) before batching.
  """

  def scale(image, label):
    # Cast the image to float32 and divide by 255; the label passes through.
    return tf.cast(image, tf.float32) / 255.0, label

  # with_info=True makes tfds.load return (datasets, info); info is not used.
  splits, _ = tfds.load(
      name='mnist',
      with_info=True,
      as_supervised=True,
      try_gcs=True,
  )

  train_split = splits['train']
  test_split = splits['test']

  shuffled_train = train_split.map(scale).shuffle(10000)
  return shuffled_train.batch(batch_size), test_split.map(scale).batch(batch_size)

def create_model():
  """Build the Keras Sequential model used by the reproduction script.

  Returns:
    An uncompiled `tf.keras.Sequential` made of a 32-filter 3x3 ReLU Conv2D
    taking (28, 28, 1) inputs, a Flatten layer, a 128-unit ReLU Dense layer,
    and a 10-unit Dense output layer with no activation.
  """
  layers = tf.keras.layers
  stack = [
      layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dense(10),
  ]
  return tf.keras.Sequential(stack)

train_dataset, test_dataset = get_dataset()

# Create and compile the model inside the TPU strategy scope.
with tpu_strategy.scope():
  model = create_model()
  # The model's last Dense layer has no activation, hence from_logits=True.
  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  model.compile(
      optimizer='adam',
      loss=loss_fn,
      metrics=['sparse_categorical_accuracy'],
  )

# Train for 10 epochs with the TQDM progress bar as the only callback;
# verbose=0 is passed so Keras' own progress output is turned off.
progress_bar = tfa.callbacks.TQDMProgressBar()
model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    callbacks=[progress_bar],
    verbose=0,
)

Error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2.py in on_epoch(self, epoch, mode)
    766     try:
--> 767       yield epoch_logs
    768     finally:

13 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    341                 training_context=training_context,
--> 342                 total_epochs=epochs)
    343             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    180       cbks.make_logs(model, batch_logs, batch_outs, mode)
--> 181       step += 1
    182 

/usr/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
     87             try:
---> 88                 next(self.gen)
     89             except StopIteration:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2.py in on_batch(self, step, mode, size)
    787           self.callbacks._call_batch_hook(
--> 788               mode, 'end', step, batch_logs)
    789           self.progbar.on_batch_end(step, batch_logs)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/callbacks.py in _call_batch_hook(self, mode, hook, batch, logs)
    238       batch_hook = getattr(callback, hook_name)
--> 239       batch_hook(batch, logs)
    240     self._delta_ts[hook_name].append(time.time() - t_before_callbacks)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
    527     # For backwards compatibility.
--> 528     self.on_batch_end(batch, logs=logs)
    529 

/usr/local/lib/python3.6/dist-packages/tensorflow_addons/callbacks/tqdm_progress_bar.py in on_batch_end(self, batch, logs)
    169 
--> 170         if self.steps_so_far < self.total_steps:
    171 

TypeError: '<' not supported between instances of 'int' and 'NoneType'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-21-0096553acbe6> in <module>()
     51                 metrics=['sparse_categorical_accuracy'])
     52 
---> 53 model.fit(train_dataset,epochs=10,validation_data=test_dataset,callbacks=[tfa.callbacks.TQDMProgressBar()])

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    817         max_queue_size=max_queue_size,
    818         workers=workers,
--> 819         use_multiprocessing=use_multiprocessing)
    820 
    821   def evaluate(self,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    395                       total_epochs=1)
    396                   cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST,
--> 397                                  prefix='val_')
    398 
    399     return model.history

/usr/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
     97                 value = type()
     98             try:
---> 99                 self.gen.throw(type, value, traceback)
    100             except StopIteration as exc:
    101                 # Suppress StopIteration *unless* it's the same exception that

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2.py in on_epoch(self, epoch, mode)
    769       if mode == ModeKeys.TRAIN:
    770         # Epochs only apply to `fit`.
--> 771         self.callbacks.on_epoch_end(epoch, epoch_logs)
    772       self.progbar.on_epoch_end(epoch, epoch_logs)
    773 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/callbacks.py in on_epoch_end(self, epoch, logs)
    300     logs = logs or {}
    301     for callback in self.callbacks:
--> 302       callback.on_epoch_end(epoch, logs)
    303 
    304   def on_train_batch_begin(self, batch, logs=None):

/usr/local/lib/python3.6/dist-packages/tensorflow_addons/callbacks/tqdm_progress_bar.py in on_epoch_end(self, epoch, logs)
    151             # update the rest of the steps in epoch progress bar
    152             self.epoch_progress_tqdm.update(
--> 153                 self.total_steps - self.epoch_progress_tqdm.n
    154             )
    155             self.epoch_progress_tqdm.close()

TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'
bug help wanted

Most helpful comment

@sourcecode369 please be careful with your phrasing, as it sounds a bit like an order (which I know it isn't, of course). Something like "I need it for my current project, it would be great if we could find a fix soon" would sound much better :)

All 7 comments

@shun-lin

I will take a look thanks @sourcecode369

Yes, @shun-lin thank you so much. Hope you find a fix soon.

@sourcecode369 please be careful with your phrasing, as it sounds a bit like an order (which I know it isn't, of course). Something like "I need it for my current project, it would be great if we could find a fix soon" would sound much better :)

Hi @sourcecode369, can you update your tensorflow package to 2.2.0-rc3? This issue is fixed in 2.2.0-rc3; I just checked in Colab, and it should work without any error.

Hi @shun-lin , thanks for your quick response. Updating to tf 2.2.0-rc3 fixed the issue for me.

sounds good, will close the issue then :)

Was this page helpful?
0 / 5 - 0 ratings

Related issues

shun-lin picture shun-lin · 4 Comments

seanpmorgan picture seanpmorgan · 4 Comments

seanpmorgan picture seanpmorgan · 4 Comments

WindQAQ picture WindQAQ · 4 Comments

seanpmorgan picture seanpmorgan · 3 Comments