Mask_rcnn: epoch got stuck in the first round on keras_model.fit_generator when training own dataset

Created on 6 Nov 2018 · 6 Comments · Source: matterport/Mask_RCNN

My environment:
windows 10 x64
python 3.6.6
cuda 9.0, V9.0.176
cudnn 7.3.1
tensorflow 1.11.0
keras 2.2.4.

`config = tensorflow.ConfigProto()
# `config` here is the tensorflow.ConfigProto created on the line above.
# allow_growth is TensorFlow's documented GPU option for allocating GPU memory
# on demand instead of reserving it all when the session starts.
config.gpu_options.allow_growth = True
# NOTE(review): `session` is not handed to Keras anywhere in the visible
# script -- confirm whether it is meant to be installed as the Keras backend
# session.
session = tensorflow.Session(config=config)
# Root of the local Mask_RCNN checkout; it must be on sys.path before the
# `mrcnn` imports below can resolve.
ROOT_DIR = os.path.abspath("C:/tool/Mask_RCNN/")
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log
# Directory passed to the model as `model_dir` further down.
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
# Local path of the COCO weights file; it is downloaded when not present.
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
if not os.path.exists(COCO_MODEL_PATH):
    print(COCO_MODEL_PATH)
    utils.download_trained_weights(COCO_MODEL_PATH)
class DrugDataset(utils.Dataset):
    """Dataset of "Newton Ring" samples stored one folder per image.

    Every sample folder is expected to hold ``imgrs.jpg`` (the image),
    ``maskrs.jpg`` (the mask) and ``info.yaml`` (the label names); see
    ``load_shapes`` for how the paths are built.

    NOTE(review): this class does not override ``load_mask``, so mask loading
    falls back to whatever ``utils.Dataset`` provides. Presumably that default
    yields no instances, in which case a training data generator that skips
    images without instances would skip every image -- confirm, and add a
    ``load_mask`` override that reads ``mask_path`` if so.
    """

    def get_obj_index(self, image):
        """Return the largest value found in ``image`` (``np.max``).

        Presumably ``image`` is an index-coloured mask, making the maximum the
        number of objects -- TODO confirm against the caller.
        """
        n = np.max(image)
        return n

    def from_yaml_get_class(self, image_id):
        """Return the label names stored for ``image_id``, minus the first one.

        Reads the YAML file registered as ``yaml_path`` for the image and
        returns its ``label_names`` list without the leading entry
        (presumably the background label -- TODO confirm). An empty
        ``label_names`` list yields an empty result instead of raising.

        Raises:
            KeyError: if the YAML document has no ``label_names`` key.
        """
        info = self.image_info[image_id]
        with open(info['yaml_path']) as f:
            # Deliberately safe_load instead of the original bare yaml.load():
            # yaml.load without an explicit Loader can construct arbitrary
            # Python objects and is rejected by recent PyYAML releases.
            temp = yaml.safe_load(f)
        # Slice instead of `del labels[0]` so an empty list does not raise.
        return temp['label_names'][1:]

    def load_shapes(self, img_number, count, height, width, img_folder, mask_folder, imglist, dataset_root_path):
        """Register the "rainbow" class and the first ``count`` selected images.

        Args:
            img_number: sequence of 1-based positions into ``imglist``.
            count: how many entries of ``img_number`` to register.
            height, width: stored as-is in each image's info record.
            img_folder: folder containing the per-sample sub-folders.
            mask_folder: folder containing the per-sample mask/YAML sub-folders.
            imglist: names of the per-sample sub-folders.
            dataset_root_path: not used by this method; kept so existing
                callers keep working.
        """
        self.add_class("Newton Ring", 1, "rainbow")

        for i in range(count):
            # img_number is 1-based, imglist is 0-based.
            filestr = imglist[img_number[i] - 1]
            sample_mask_dir = mask_folder + "/" + filestr
            self.add_image(
                "Newton Ring",
                image_id=i,
                path=img_folder + "/" + filestr + "/imgrs.jpg",
                width=width,
                height=height,
                mask_path=sample_mask_dir + "/maskrs.jpg",
                yaml_path=sample_mask_dir + "/info.yaml",
            )

class ShapesConfig(Config):
    """Training configuration for the single-class "Newton Ring" dataset.

    Only the values listed here are overridden; everything else comes from
    the ``Config`` base class.
    """

    # Name of this configuration.
    NAME = "Newton Ring"

    # One GPU processing one image at a time.
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

    # Background plus the single foreground class.
    NUM_CLASSES = 1 + 1

    # Both image dimension settings are fixed at 1024.
    IMAGE_MAX_DIM = 1024
    IMAGE_MIN_DIM = 1024

    # Anchor side lengths in pixels: six times (8, 16, 32, 64, 128).
    RPN_ANCHOR_SCALES = (48, 96, 192, 384, 768)

    TRAIN_ROIS_PER_IMAGE = 32

    # Step counts for training and validation.
    STEPS_PER_EPOCH = 100
    VALIDATION_STEPS = 5

    # Extra attribute carried on the config; nothing in this class reads it.
    iter_num = 0
# NOTE: this rebinds `config`, which earlier held the TensorFlow ConfigProto.
config = ShapesConfig()
config.display()

# Images, masks and YAML files all live under the same root, one sub-folder
# per sample.
dataset_root_path = "D:/PROJECT/Dataset/HSIR/imageandmask"
img_folder = dataset_root_path
mask_folder = dataset_root_path
# os.listdir() returns entries in arbitrary order. The split below works on
# positions in this list and the test positions are written to testdata.yml,
# so sort the listing to make those positions mean the same sample folders on
# every run and every machine.
imglist = sorted(os.listdir(img_folder))
count = len(imglist)
width = 1224
height = 1024

# 1-based sample numbers; DrugDataset.load_shapes subtracts 1 when indexing.
x = range(1, count + 1)
# 98% of the samples go to training; the remaining 2% are divided into
# validation (20% of the remainder) and test (80% of the remainder).
x_train, x_other = train_test_split(x, test_size=0.02, random_state=42)
x_val, x_test = train_test_split(x_other, test_size=0.8, random_state=42)

dataset_train = DrugDataset()
dataset_train.load_shapes(x_train, len(x_train), height, width, img_folder, mask_folder, imglist, dataset_root_path)
dataset_train.prepare()

dataset_val = DrugDataset()
dataset_val.load_shapes(x_val, len(x_val), height, width, img_folder, mask_folder, imglist, dataset_root_path)
dataset_val.prepare()

# Persist the held-out sample numbers so a later evaluation run can reuse them.
data = dict(
    dataname='rainbow_test',
    index_test=x_test,
)

with open('testdata.yml', 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

print("config and divide the dataset")
model = modellib.MaskRCNN(mode="training", model_dir=MODEL_DIR, config=config)

# Which weights to start from: "imagenet", "coco", or "last".
init_with = "coco"

if init_with == "coco":
    # Load the COCO weights by layer name, leaving out the four head layers
    # listed in `exclude` (presumably because their shapes depend on the
    # number of classes -- confirm).
    model.load_weights(
        COCO_MODEL_PATH,
        by_name=True,
        exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"],
    )
elif init_with == "imagenet":
    model.load_weights(model.get_imagenet_weights(), by_name=True)
elif init_with == "last":
    model.load_weights(model.find_last(), by_name=True)

print("start training")

# First pass: layers='heads' at the configured learning rate.
model.train(
    dataset_train,
    dataset_val,
    layers='heads',
    epochs=5,
    learning_rate=config.LEARNING_RATE,
)

# Second pass: layers="all" at one tenth of the configured learning rate.
model.train(
    dataset_train,
    dataset_val,
    layers="all",
    epochs=10,
    learning_rate=config.LEARNING_RATE / 10,
)

print("save model")
# Write the trained weights next to the training logs.
model_path = os.path.join(MODEL_DIR, "mask_rcnn_shapes.h5")
model.keras_model.save_weights(model_path)

I modified train_shapes.py and added cross-validation.
My dataset:
1896 images, 1224x1024, with only 1 class and 1 object per image.
Training gets stuck on the first epoch (1/5): GPU utilization stays at almost 0% and nothing changes for hours.
I then traced the code and found that it is stuck in model.py at 'keras_model.fit_generator()'.
I tried the solutions in #287:
setting workers=1 and use_multiprocessing=False,
or updating Keras;
neither worked.
Has anyone encountered this problem and found another solution?
Please help.

All 6 comments

Someone told me the main problem is the version of Keras,
but after I changed to tensorflow 1.6.0 and keras 2.1.6
it still did not work, even on a Mac without a GPU.

I tried to trace the code by printing something at every step.
I found that when I ran balloon.py,
it executed step by step, as follows:
Epoch 1/10
-1
build rpn
Init batch arrays
Add to batch
batch b= 1
1/100 [..............................] - ETA: 23:05 - loss: 4.0967 - rpn_class_loss: 0.0105 - rpn_bbox_loss: 0.2817 - mrcnn_class_loss: 2.6626 - mrcnn_bbox_loss: 0.5993 - mrcnn_mask_loss: 0.54260
build rpn
Init batch arrays
Add to batch
batch b= 1
2/100 [......
But when I ran my own code,
nothing was printed after Epoch 1/5:

when I tried

b = 0  # batch item index
    image_index = -1
    image_ids = np.copy(dataset.image_ids)
    error_count = 0
    no_augmentation_sources = no_augmentation_sources or []

    # Anchors
    # [anchor_count, (y1, x1, y2, x2)]
    print("initial")
    backbone_shapes = compute_backbone_shapes(config, config.IMAGE_SHAPE)
    anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                             config.RPN_ANCHOR_RATIOS,
                                             backbone_shapes,
                                             config.BACKBONE_STRIDES,
                                             config.RPN_ANCHOR_STRIDE)
    print("initial2")
while True:
        try:
            # Increment index to pick next image. Shuffle if at the start of an epoch.
            print(image_index)
            image_index = (image_index + 1) % len(image_ids)
            if shuffle and image_index == 0:
                np.random.shuffle(image_ids)

            # Get GT bounding boxes and masks for image.
            image_id = image_ids[image_index]

            # If the image source is not to be augmented pass None as augmentation
            if dataset.image_info[image_id]['source'] in no_augmentation_sources:
                image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
                load_image_gt(dataset, config, image_id, augment=augment,
                              augmentation=None,
                              use_mini_mask=config.USE_MINI_MASK)
                print("no augment")
            else:
                image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
                    load_image_gt(dataset, config, image_id, augment=augment,
                                augmentation=augmentation,
                                use_mini_mask=config.USE_MINI_MASK)

            # Skip images that have no instances. This can happen in cases
            # where we train on a subset of classes and the image doesn't
            # have any of the classes we care about.
            if not np.any(gt_class_ids > 0):
                print("no instance")
                continue

            # RPN Targets
            rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors,
                                                    gt_class_ids, gt_boxes, config)
            print("build rpn")

it only printed
Epoch 1/1
initial
initial2
-1
no instance
0
no instance
1
no instance
2
no instance
3
no instance
4
no instance
5
no instance
6
no instance
7
no instance
8
no instance
9
no instance
10
no instance
11
no instance
12
no instance
13
no instance
14
no instance
0
....
repeatedly
I did add self.add_class("Newton Ring", 1, "rainbow") in my code, so why is there still no instance?

I ran into the same problem and found a solution at
https://stackoverflow.com/questions/51176661/keras-seems-to-hang-after-call-to-fit-generator
Hope it may help.

Hey, did you find a solution?

I had the same problem and I solved it by downgrading the graphic card's driver.

Was this page helpful?
0 / 5 - 0 ratings

Related issues

canerozer picture canerozer · 3 Comments

wjdhuster2018 picture wjdhuster2018 · 3 Comments

JonathanCMitchell picture JonathanCMitchell · 3 Comments

Mhaiyang picture Mhaiyang · 4 Comments

msson picture msson · 4 Comments