Bert: problem multiclass text classification

Created on 7 Apr 2019 · 14Comments · Source: google-research/bert

Hi,

I am trying to classify text into 34 mutually exclusive classes using BERT. After preparing the train, dev and test TSV files, I try to execute the following command for training and testing:

!python bert/run_classifier.py \ --task_name=cola \ --do_train=true \ --do_eval=true \ --data_dir=./Bert_Input_Folder \ --vocab_file=./uncased_L-24_H-1024_A-16/vocab.txt \ --bert_config_file=./uncased_L-24_H-1024_A-16/bert_config.json \ --init_checkpoint=./uncased_L-24_H-1024_A-16/bert_model.ckpt \ --max_seq_length=512 \ --train_batch_size=32 \ --learning_rate=2e-5 \ --num_train_epochs=3.0 \ --output_dir=./Bert_Output_Folder

I get the following error

`WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:

https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x7f4b945a01e0>) includes params argument, but params are not passed to Estimator.
INFO:tensorflow:Using config: {'_model_dir': './Bert_Output_Folder', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
rewrite_options {
meta_optimizer_iterations: ONE
}
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.
INFO:tensorflow:Writing example 0 of 23834
Traceback (most recent call last):
File "bert/run_classifier.py", line 981, in
tf.app.run()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "bert/run_classifier.py", line 870, in main
train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
File "bert/run_classifier.py", line 490, in file_based_convert_examples_to_features
max_seq_length, tokenizer)
File "bert/run_classifier.py", line 459, in convert_single_example
label_id = label_map[example.label]
KeyError: '33'`

In the run_classifier.py file, I have modified the get_labels() function, originally written for a binary classification task, to return all 34 classes:
def get_labels(self): """See base class.""" return ["0", "1", "2", ..., "33"]

Any idea what is wrong or if I am missing additional steps?

Thanks!

Source

86mm86

Most helpful comment

Hello @86mm86 and @abhirut, I am also trying to solve a basic multiclass problem on an Urdu corpus with 12 classes. I changed get_labels in the ColaProcessor as mentioned by @abhirut, but it is giving me the following error.
`Please wait...
INFO:tensorflow:Writing example 0 of 3950

ValueError Traceback (most recent call last)
in ()
1 print('Please wait...')
2 train_features = run_classifier.convert_examples_to_features(
----> 3 train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
4 print('>> Started training at {} '.format(datetime.datetime.now()))
5 print(' Num examples = {}'.format(len(train_examples)))

Classification_bert\run_classifier.py in convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)
753 tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
754 })
--> 755
756 if is_training:
757 d = d.repeat()

Classification_bert\run_classifier.py in convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer)
355 """See base class."""
356 return [str(x) for x in range(12)]
--> 357
358
359 def _create_examples(self, lines, set_type):

\Classification_bert\tokenization.py in tokenize(self, text)
170 def tokenize(self, text):
171 split_tokens = []
--> 172 for token in self.basic_tokenizer.tokenize(text):
173 for sub_token in self.wordpiece_tokenizer.tokenize(token):
174 split_tokens.append(sub_token)

Classification_bert\tokenization.py in tokenize(self, text)
196 def tokenize(self, text):
197 """Tokenizes a piece of text."""
--> 198 text = convert_to_unicode(text)
199 text = self._clean_text(text)
200

Classification_bert\tokenization.py in convert_to_unicode(text)
84 return text.decode("utf-8", "ignore")
85 else:
---> 86 raise ValueError("Unsupported string type: %s" % (type(text)))
87 elif six.PY2:
88 if isinstance(text, str):

ValueError: Unsupported string type: `

My class labels were strings, which I transformed with LabelEncoder and passed as label_list for ColaProcessor's get_labels() function.

samreenkazi on 30 Apr 2019

👍3

All 14 comments

Did you get this to run?

abhirut on 13 Apr 2019

Are you sure you modified the ColaProcessor's get_labels function? I have 159 classes (a bit extreme), but I changed the get_labels function in ColaProcessor as follows -

class ColaProcessor(DataProcessor):
  """CoLA (GLUE version) processor, with the label set widened to 159 classes.

  Each `get_*_examples` method reads one TSV file from `data_dir` via
  `self._read_tsv` and hands the rows to `self._create_examples` together
  with the split name. Neither helper is shown in this snippet.
  """

  def get_train_examples(self, data_dir):
    """Returns the examples built from `<data_dir>/train.tsv`."""
    tsv_path = os.path.join(data_dir, "train.tsv")
    rows = self._read_tsv(tsv_path)
    return self._create_examples(rows, "train")

  def get_dev_examples(self, data_dir):
    """Returns the examples built from `<data_dir>/dev.tsv`."""
    tsv_path = os.path.join(data_dir, "dev.tsv")
    rows = self._read_tsv(tsv_path)
    return self._create_examples(rows, "dev")

  def get_test_examples(self, data_dir):
    """Returns the examples built from `<data_dir>/test.tsv`."""
    tsv_path = os.path.join(data_dir, "test.tsv")
    rows = self._read_tsv(tsv_path)
    return self._create_examples(rows, "test")

  def get_labels(self):
    """Returns the class labels as the strings "0" through "158"."""
    # Labels are strings, not ints: one label per class id in range(159).
    return list(map(str, range(159)))

And it seems to work now.

abhirut on 13 Apr 2019

Hi,

Actually for some reason, even if the condition check [str(x) for x in range(34)] == ['0', '1', '2', ... '33'] is True (as it should be), your approach solved the problem. Thanks!

Now unfortunately I have another issue: the training seems to be stuck at the checkpoint 0. Here is the last output

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/learning_rate_decay_v2.py:321: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Deprecated in favor of operator or tf.math.divide. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Create CheckpointSaverHook. INFO:tensorflow:Graph was finalized. 2019-04-13 10:40:33.771440: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300000000 Hz 2019-04-13 10:40:33.773999: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x157e520 executing computations on platform Host. Devices: 2019-04-13 10:40:33.774044: I tensorflow/compiler/xla/service/service.cc:158] StreamExecutor device (0): <undefined>, <undefined> INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op. INFO:tensorflow:Saving checkpoints for 0 into ./Bert_Output_Folder/model.ckpt.

I am running the training in a Google Colab notebook, and the command is still in execution after 2 hours of waiting. I have googled the problem and it seems that it is an expected behaviour, as described here #212. I will open another issue if this is not the case.

86mm86 on 13 Apr 2019

Oh, I see that it's running on a CPU. Would it be faster if you ran it on a GPU? I'm running it on a K80, and reduced batch size and max sequence length (to fit stuff into memory) but I see these outputs in a few minutes -

INFO:tensorflow:global_step/sec: 0.868662
INFO:tensorflow:examples/sec: 6.94929
INFO:tensorflow:global_step/sec: 0.923851
INFO:tensorflow:examples/sec: 7.39081
INFO:tensorflow:global_step/sec: 0.934779
INFO:tensorflow:examples/sec: 7.47823

abhirut on 13 Apr 2019

I tried to run the training on a GPU in Colab, but I soon ran out of memory, and I do not have a GPU on my local machine. But after reducing the train_batch_size and max_seq_length parameters as you suggested, it worked just fine. Thank you!

86mm86 on 14 Apr 2019

ValueError: Unsupported string type: `

My class labels were strings, which I transformed with LabelEncoder and passed as label_list for ColaProcessor's get_labels() function.

samreenkazi on 30 Apr 2019

👍3

I'm having the same error as @samreenkazi. I have string labels and used LabelEncoder to transform them into a list. The only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

tbs17 on 25 Oct 2019

I'm having the same error as @samreenkazi. I have string labels and used LabelEncoder to transform them into a list. The only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

I have the same problem, could you resolve it ?

sabra333 on 4 Dec 2019

you need to modify the run script to include more classes. the original
script is binary...

On Wed, Dec 4, 2019 at 1:59 AM sabra333 notifications@github.com wrote:

I'm having the same error as @samreenkazi https://github.com/samreenkazi
. I have string labels and used LabelEncoder to transform them into a list. The
only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

I have the same problem, could you resolve it ?

—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
https://github.com/google-research/bert/issues/559?email_source=notifications&email_token=AIKT4C5MRDGEJJWNMS4Y5F3QW5ILHA5CNFSM4HECCFYKYY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGOEF36MKY#issuecomment-561505835,
or unsubscribe
https://github.com/notifications/unsubscribe-auth/AIKT4C2XKLT2GNBXUJUQ5OLQW5ILHANCNFSM4HECCFYA
.

tbs17 on 4 Dec 2019

I'm having the same error as @samreenkazi. I have string labels and used LabelEncoder to transform them into a list. The only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

Me too. Why am I getting this error?

swlee1223 on 17 Dec 2019

You will have to adjust the run.py to include more classes. Someone above
already shared the code.

On Tue, Dec 17, 2019, 5:07 PM swlee1223 notifications@github.com wrote:

I'm having the same error as @samreenkazi https://github.com/samreenkazi
. I have string labels and used LabelEncoder to transform them into a list. The
only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

Me too. Why am I getting this error?

—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
https://github.com/google-research/bert/issues/559?email_source=notifications&email_token=AIKT4CY6H2NHCNIG6UWRQYDQZFETTA5CNFSM4HECCFYKYY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGOEHED2CQ#issuecomment-566770954,
or unsubscribe
https://github.com/notifications/unsubscribe-auth/AIKT4CZEO25UXR6P4A4CVO3QZFETTANCNFSM4HECCFYA
.

tbs17 on 18 Dec 2019

I'm having the same error as @samreenkazi. I have string labels and used LabelEncoder to transform them into a list. The only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

I had the same issue and was editing the script to make it work. But finally I noticed that one of the entries in the dataframe column was NaN, and that was the reason for the error. I then simply replaced the entry with 'No Text' using the fillna function and, to my surprise, it worked. So please check the dataframe input.

ShrikanthSingh on 18 Dec 2019

that's right, Shrikanth! Also make sure your column name doesn't have any
leading or trailing zeros...

On Wed, Dec 18, 2019 at 11:18 AM Shrikanth Singh notifications@github.com
wrote:

I'm having the same error as @samreenkazi https://github.com/samreenkazi
. I have string labels and used LabelEncoder to transform them into a list. The
only slight difference in the error is the last line. Can someone help?

ValueError: Unsupported string type:

I had the same issue and was editing the script to make it work. But
finally I noticed that one of the entries in the dataframe column was NaN, and
that was the reason for the error. I then simply replaced the entry with 'No
Text' using the fillna function and, to my surprise, it worked. So please
check the dataframe input.

—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
https://github.com/google-research/bert/issues/559?email_source=notifications&email_token=AIKT4CYLEDCVMNTUOVIOV5LQZJEMHA5CNFSM4HECCFYKYY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGOEHGU5FI#issuecomment-567103125,
or unsubscribe
https://github.com/notifications/unsubscribe-auth/AIKT4C52BCFFUX5XLAROBYLQZJEMHANCNFSM4HECCFYA
.

tbs17 on 18 Dec 2019

👍1

Sounds like it's still the label issue. The labels have to be of string type. If your
labels have been converted to numeric values, make sure they are strings in the model
as well.

On Sun, Sep 13, 2020, 1:29 AM Mrudhulraj notifications@github.com wrote:

Hey, so I have also been working on BERT and I got the error given below.
My dataset is already labeled, hence there is no need for
LabelEncoder. There are also no NaN values in my dataset. Any suggestions?

ValueError Traceback (most recent call last)
in
1 MAX_SEQ_LENGTH = 100
----> 2 train_features =
bert.run_classifier.convert_examples_to_features(train_InputExamples,
labels, MAX_SEQ_LENGTH, tokenizer)
3 test_features =
bert.run_classifier.convert_examples_to_features(test_InputExamples,
labels, MAX_SEQ_LENGTH, tokenizer)

~\anaconda3\envs\my_projects\lib\site-packages\bert\run_classifier.py in
convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer)
778
779 feature = convert_single_example(ex_index, example, label_list,
--> 780 max_seq_length, tokenizer)
781
782 features.append(feature)

~\anaconda3\envs\my_projects\lib\site-packages\bert\run_classifier.py in
convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer)
394 label_map[label] = i
395
--> 396 tokens_a = tokenizer.tokenize(example.text_a)
397 tokens_b = None
398 if example.text_b:

~\anaconda3\envs\my_projects\lib\site-packages\bert\tokenization.py in
tokenize(self, text)
190 def tokenize(self, text):
191 split_tokens = []
--> 192 for token in self.basic_tokenizer.tokenize(text):
193 if preserve_token(token, self.vocab):
194 split_tokens.append(token)

~\anaconda3\envs\my_projects\lib\site-packages\bert\tokenization.py in
tokenize(self, text)
221 def tokenize(self, text):
222 """Tokenizes a piece of text."""
--> 223 text = convert_to_unicode(text)
224 text = self._clean_text(text)
225

~\anaconda3\envs\my_projects\lib\site-packages\bert\tokenization.py in
convert_to_unicode(text)
104 return text.decode("utf-8", "ignore")
105 else:
--> 106 raise ValueError("Unsupported string type: %s" % (type(text)))
107 elif six.PY2:
108 if isinstance(text, str):

ValueError: Unsupported string type:

—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
https://github.com/google-research/bert/issues/559#issuecomment-691617112,
or unsubscribe
https://github.com/notifications/unsubscribe-auth/AIKT4C3NEMFPJ4R762MH2OTSFRKEPANCNFSM4HECCFYA
.