I am training the model on this dataset. It consists of images of 10 brands of cigarette packs on supermarket shelves, along with their bounding boxes. All packs that are not one of these 10 brands are labelled as class 0, so in total there are 11 classes of objects to detect.
I used the faster_rcnn_resnet101 model and created my own TFRecord files. I got up to speed by following the Pet Detection tutorial.
I performed the training locally on a machine with 20 vCPUs and 130 GB RAM, and ran it for 16,000 global steps. The model is able to detect cigarette packs, but it is not behaving as expected:
If there are two objects of the same class placed right next to each other on the shelf, it detects one but misses the other, which should not happen if the model has learned the features correctly. https://i.stack.imgur.com/cc6hV.jpg
When I feed in negative images of supermarket shelves that contain none of the 11 classes of objects, it still draws bounding boxes with 99% confidence and labels them as category 1, which doesn't make sense! https://i.stack.imgur.com/oi4Ax.png
Though it is not clear in the images, I can assure you that all the boxes have been labelled as belonging to category 1.
What is going wrong?
Config file
# Faster R-CNN with Resnet-101 (v1), originally configured for the Oxford-IIIT
# Pet Dataset. Users should configure the fine_tune_checkpoint field in the
# train config as well as the label_map_path and input_path fields in the
# train_input_reader and eval_input_reader. Search for "PATH_TO_BE_CONFIGURED"
# to find the fields that should be configured.
model {
  faster_rcnn {
    num_classes: 11
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 600
        max_dimension: 1024
      }
    }
    feature_extractor {
      type: 'faster_rcnn_resnet101'
      first_stage_features_stride: 16
    }
    first_stage_anchor_generator {
      grid_anchor_generator {
        scales: [0.25, 0.5, 1.0, 2.0]
        aspect_ratios: [0.5, 1.0, 2.0]
        height_stride: 16
        width_stride: 16
      }
    }
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          weight: 0.0
        }
      }
      initializer {
        truncated_normal_initializer {
          stddev: 0.01
        }
      }
    }
    first_stage_nms_score_threshold: 0.0
    first_stage_nms_iou_threshold: 0.7
    first_stage_max_proposals: 300
    first_stage_localization_loss_weight: 2.0
    first_stage_objectness_loss_weight: 1.0
    initial_crop_size: 14
    maxpool_kernel_size: 2
    maxpool_stride: 2
    second_stage_box_predictor {
      mask_rcnn_box_predictor {
        use_dropout: false
        dropout_keep_probability: 1.0
        fc_hyperparams {
          op: FC
          regularizer {
            l2_regularizer {
              weight: 0.0
            }
          }
          initializer {
            variance_scaling_initializer {
              factor: 1.0
              uniform: true
              mode: FAN_AVG
            }
          }
        }
      }
    }
    second_stage_post_processing {
      batch_non_max_suppression {
        score_threshold: 0.0
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 300
      }
      score_converter: SOFTMAX
    }
    second_stage_localization_loss_weight: 2.0
    second_stage_classification_loss_weight: 1.0
  }
}
train_config: {
  batch_size: 1
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.0003
          schedule {
            step: 0
            learning_rate: .0003
          }
          schedule {
            step: 900000
            learning_rate: .00003
          }
          schedule {
            step: 1200000
            learning_rate: .000003
          }
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  gradient_clipping_by_norm: 10.0
  fine_tune_checkpoint: "data/model.ckpt"
  from_detection_checkpoint: true
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
}
train_input_reader: {
  tf_record_input_reader {
    input_path: "data/cig_train.record"
  }
  label_map_path: "data/cig_label_map.pbtxt"
}
eval_config: {
  num_examples: 2000
}
eval_input_reader: {
  tf_record_input_reader {
    input_path: "data/cig_val.record"
  }
  label_map_path: "data/cig_label_map.pbtxt"
}
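A quick way to sanity-check that this config parses and that num_classes agrees with the label map below (a minimal sketch; the config filename is my assumption, adjust the path to your own layout):

# Sketch: parse the pipeline config and print the class count.
import tensorflow as tf
from google.protobuf import text_format
from object_detection.protos import pipeline_pb2

pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
with tf.gfile.GFile('data/faster_rcnn_resnet101_cig.config') as fid:
  text_format.Merge(fid.read(), pipeline_config)
print(pipeline_config.model.faster_rcnn.num_classes)  # should print 11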
Label map
item {
  id: 1
  name: '1'
}
item {
  id: 2
  name: '2'
}
item {
  id: 3
  name: '3'
}
item {
  id: 4
  name: '4'
}
item {
  id: 5
  name: '5'
}
item {
  id: 6
  name: '6'
}
item {
  id: 7
  name: '7'
}
item {
  id: 8
  name: '8'
}
item {
  id: 9
  name: '9'
}
item {
  id: 10
  name: '10'
}
item {
  id: 11
  name: '11'
}
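For reference, label_map_util.get_label_map_dict (used by the generator below) turns this file into a name-to-id mapping, which is easy to inspect:

# Sketch: inspect the mapping produced from the label map above.
from object_detection.utils import label_map_util

label_map_dict = label_map_util.get_label_map_dict('data/cig_label_map.pbtxt')
print(label_map_dict)  # {'1': 1, '2': 2, ..., '11': 11} -- name (string) -> id (int)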
TFRecord Generator
import hashlib
import io
import logging
import os
import random
import re
from lxml import etree
import PIL.Image
import tensorflow as tf
from utils import dataset_util
from utils import label_map_util
flags = tf.app.flags
flags.DEFINE_string('data_dir', '', 'Root directory to raw pet dataset.')
flags.DEFINE_string('output_dir', '', 'Path to directory to output TFRecords.')
flags.DEFINE_string('label_map_path', 'data/pet_label_map.pbtxt', 'Path to label map proto')
FLAGS = flags.FLAGS
# Leftover from the pet tutorial; not used by this script.
def get_class_name_from_filename(file_name):
  """Gets the class name from a file.

  Args:
    file_name: The file name to get the class name from.
      ie. "american_pit_bull_terrier_105.jpg"

  Returns:
    The class name, e.g. "american_pit_bull_terrier".
  """
  match = re.match(r'([A-Za-z_]+)(_[0-9]+\.jpg)', file_name, re.I)
  return match.groups()[0]
def dict_to_tf_example(data,
                       label_map_dict,
                       image_subdirectory,
                       ignore_difficult_instances=False):
  """Converts one annotation line to a tf.Example proto.

  Notice that this function normalizes the bounding box coordinates provided
  by the raw data.

  Args:
    data: one line from annotation.txt, formatted as
      "<filename> <num_boxes>" followed by five numbers per box:
      xmin, ymin, width, height and a 0-based class id.
    label_map_dict: A map from integer ids to string label names.
    image_subdirectory: String specifying the subdirectory within the
      dataset directory holding the actual image data.
    ignore_difficult_instances: Whether to skip difficult instances in the
      dataset (default: False). Unused here.

  Returns:
    example: The converted tf.Example.
  """
  data = data.strip().split(' ')
  img_path = os.path.join(image_subdirectory, data[0])
  print img_path
  with tf.gfile.GFile(img_path, 'rb') as fid:
    encoded_jpg = fid.read()
  encoded_jpg_io = io.BytesIO(encoded_jpg)
  image = PIL.Image.open(encoded_jpg_io)
  #if image.format != 'JPEG':
  #  raise ValueError('Image format not JPEG')
  key = hashlib.sha256(encoded_jpg).hexdigest()
  width, height = image.size
  # width = int(data['size']['width'])
  # height = int(data['size']['height'])

  num_boxes = int(data[1])
  xmin = []
  ymin = []
  xmax = []
  ymax = []
  classes = []
  classes_text = []
  truncated = []
  poses = []
  difficult_obj = []
  for i in xrange(num_boxes):
    # Boxes are stored as (xmin, ymin, width, height); convert to corners.
    xmin.append(int(data[2 + 5 * i]))
    ymin.append(int(data[3 + 5 * i]))
    tw = int(data[4 + 5 * i])
    th = int(data[5 + 5 * i])
    xmax.append(xmin[-1] + tw)
    ymax.append(ymin[-1] + th)
    # Normalize all coordinates to [0, 1].
    xmin[-1] = float(xmin[-1]) / width
    ymin[-1] = float(ymin[-1]) / height
    xmax[-1] = float(xmax[-1]) / width
    ymax[-1] = float(ymax[-1]) / height
    # Shift the 0-based class ids to the 1-based ids of the label map.
    classes.append(int(data[6 + 5 * i]) + 1)
    classes_text.append(label_map_dict[classes[-1]].encode('utf8'))
    truncated.append(0)
    poses.append('Frontal'.encode('utf8'))
    difficult_obj.append(0)
  # Original pet-tutorial parsing, kept for reference:
  '''
  for obj in data['object']:
    difficult = bool(int(obj['difficult']))
    if ignore_difficult_instances and difficult:
      continue
    difficult_obj.append(int(difficult))
    xmin.append(float(obj['bndbox']['xmin']) / width)
    ymin.append(float(obj['bndbox']['ymin']) / height)
    xmax.append(float(obj['bndbox']['xmax']) / width)
    ymax.append(float(obj['bndbox']['ymax']) / height)
    class_name = get_class_name_from_filename(data['filename'])
    classes_text.append(class_name.encode('utf8'))
    classes.append(label_map_dict[class_name])
    truncated.append(int(obj['truncated']))
    poses.append(obj['pose'].encode('utf8'))
  '''
  example = tf.train.Example(features=tf.train.Features(feature={
      'image/height': dataset_util.int64_feature(height),
      'image/width': dataset_util.int64_feature(width),
      'image/filename': dataset_util.bytes_feature(data[0].encode('utf8')),
      'image/source_id': dataset_util.bytes_feature(data[0].encode('utf8')),
      'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
      'image/encoded': dataset_util.bytes_feature(encoded_jpg),
      'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
      'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
      'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
      'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
      'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
      'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
      'image/object/class/label': dataset_util.int64_list_feature(classes),
      'image/object/difficult': dataset_util.int64_list_feature(difficult_obj),
      'image/object/truncated': dataset_util.int64_list_feature(truncated),
      'image/object/view': dataset_util.bytes_list_feature(poses),
  }))
  return example
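# Added for illustration (not in the original script): read back the first
# example from a generated record file to verify that the boxes came out
# normalized to [0, 1] and the labels shifted into 1..11.
def debug_print_first_example(record_path):
  for record in tf.python_io.tf_record_iterator(record_path):
    example = tf.train.Example.FromString(record)
    feature = example.features.feature
    print feature['image/object/bbox/xmin'].float_list.value    # normalized floats
    print feature['image/object/class/label'].int64_list.value  # ids in 1..11
    break  # the first example is enough for a spot check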
def create_tf_record(output_filename,
                     label_map_dict,
                     annotations_dir,
                     image_dir,
                     examples):
  """Creates a TFRecord file from examples.

  Args:
    output_filename: Path to where output file is saved.
    label_map_dict: The label map dictionary (id -> name, see main below).
    annotations_dir: Directory where annotation files are stored.
    image_dir: Directory where image files are stored.
    examples: Examples to parse and save to tf record.
  """
  with tf.gfile.GFile(os.path.join(annotations_dir, 'annotation.txt')) as fid:
    lines = fid.readlines()
  writer = tf.python_io.TFRecordWriter(output_filename)
  for idx, example in enumerate(examples):
    if idx % 100 == 0:
      logging.info('On image %d of %d', idx, len(examples))
    # Linear scan for the annotation line whose filename matches this example.
    pos = 0
    while pos < len(lines):
      line = lines[pos].strip().split(' ')
      if line[0] == example:
        break
      pos += 1
    if pos >= len(lines):
      logging.warning('%s not found in annotation.txt, skipping.', example)
      continue  # avoid indexing past the end of lines below
    '''
    path = os.path.join(annotations_dir, 'xmls', example + '.xml')
    if not os.path.exists(path):
      logging.warning('Could not find %s, ignoring example.', path)
      continue
    with tf.gfile.GFile(path, 'r') as fid:
      xml_str = fid.read()
    xml = etree.fromstring(xml_str)
    data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation']
    '''
    tf_example = dict_to_tf_example(lines[pos], label_map_dict, image_dir)
    writer.write(tf_example.SerializeToString())
  writer.close()
# TODO: Add test for pet/PASCAL main files.
def main(_):
  data_dir = FLAGS.data_dir
  '''label_map_dict = {
      1: '1',
      2: '2',
      3: '3',
      4: '4',
      5: '5',
      6: '6',
      7: '7',
      8: '8',
      9: '9',
      10: '10',
      11: '11',
  }'''
  label_map_dict = label_map_util.get_label_map_dict(
      '/root/stuff/tensorflow/models/data/cig_label_map.pbtxt')
  # get_label_map_dict returns name -> id; invert it to id -> name, which is
  # what dict_to_tf_example expects.
  tmpp = {}
  for key in label_map_dict:
    tmpp[label_map_dict[key]] = key
  label_map_dict = tmpp

  logging.info('Reading from cigarette dataset.')
  image_dir = os.path.join(data_dir, 'ShelfImages')
  annotations_dir = os.path.join(data_dir, '')
  # examples_list = dataset_util.read_examples_list(examples_path)
  with tf.gfile.GFile(os.path.join(annotations_dir, 'annotation.txt')) as fid:
    lines = fid.readlines()
  examples_list = [line.strip().split(' ')[0] for line in lines]

  # Test images are not included in the downloaded data set, so we shall
  # perform our own split.
  random.seed(42)
  random.shuffle(examples_list)
  num_examples = len(examples_list)
  num_train = int(0.7 * num_examples)
  num_train = 100  # NOTE: overrides the 70% split; only 100 training images.
  train_examples = examples_list[:num_train]
  val_examples = examples_list[num_train: num_train + 50]
  logging.info('%d training and %d validation examples.',
               len(train_examples), len(val_examples))

  train_output_path = os.path.join(FLAGS.output_dir, 'cig_train.record')
  val_output_path = os.path.join(FLAGS.output_dir, 'cig_val.record')
  print 1
  create_tf_record(train_output_path, label_map_dict, annotations_dir,
                   image_dir, train_examples)
  print 2
  create_tf_record(val_output_path, label_map_dict, annotations_dir,
                   image_dir, val_examples)
  print 3


if __name__ == '__main__':
  tf.app.run()
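Assuming the script is saved as create_cig_tf_record.py (the filename is my assumption), it is run as follows; note that the label_map_path flag is defined but unused, since the path is hardcoded in main:

python create_cig_tf_record.py --data_dir=/path/to/dataset --output_dir=data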
This question is better asked on StackOverflow since it is not a bug or feature request. There is also a larger community that reads questions there. Thanks!
Could you resolve the issue? I have similar problem.
@Yuvarajganesh112 Having same issue. Any solution?
You can refer to my answer on Stack Overflow.
This could be due to a bias in your training data. If you do not have approximately equal amounts of training data for each class, or one class has better training data than another, the model can learn a bias.
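One way to check for this kind of imbalance is to count how many boxes of each class appear in annotation.txt (a sketch, assuming the same line format the generator above parses: filename, box count, then five numbers per box ending with a 0-based class id):

# Sketch: per-class box counts from annotation.txt to spot class imbalance.
from collections import Counter

counts = Counter()
with open('annotation.txt') as fid:
  for line in fid:
    fields = line.strip().split(' ')
    for i in range(int(fields[1])):
      counts[int(fields[6 + 5 * i]) + 1] += 1  # +1 matches the generator's shift
print(counts.most_common())  # a heavy skew toward one class suggests bias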
@VastoLorde95 replace lines 1 to 40 of your generate_tfrecord.py script with this:
<<<<-----------------------------------------------------------------------------------------
"""
Usage:
# From tensorflow/models/
# Create train data:
python generate_tfrecord.py --csv_input=images/train_labels.csv --image_dir=images/train --output_path=train.record
# Create test data:
python generate_tfrecord.py --csv_input=images/test_labels.csv --image_dir=images/test --output_path=test.record
"""
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
import os
import io
import pandas as pd
import tensorflow as tf
from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict
flags = tf.app.flags
flags.DEFINE_string('csv_input', '', 'Path to the CSV input')
flags.DEFINE_string('image_dir', '', 'Path to the image directory')
flags.DEFINE_string('output_path', '', 'Path to output TFRecord')
FLAGS = flags.FLAGS
def class_text_to_int(row_label):
    if row_label == '2ENG':
        return 1
    elif row_label == '4ENG':
        return 2
    elif row_label == 'backENG':
        return 3
    else:
        return None
------------------------------------------------------------------------------------------->>>>>
then change the "# TO-DO replace this with label map" section (the class_text_to_int function above) to match your own label map.
All the best...
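For this dataset the adapted function could look like this (a sketch; the names '1' through '11' match the label map above):

def class_text_to_int(row_label):
    # The label map above uses the strings '1' .. '11' as class names.
    if row_label in [str(i) for i in range(1, 12)]:
        return int(row_label)
    return None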