Spacy: Loading NER model from dump doesn't work

Created on 28 Nov 2016 · 6 Comments · Source: explosion/spaCy

I'm using the EntityRecognizer.load method to load my model. This seems to run successfully, but the resulting EntityRecognizer doesn't work properly, whereas the original one does.

The original script, https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py, outputs:

Who WP  2
is VBZ  2
Shaka NNP PERSON 3
Khan NNP PERSON 1
? .  2

The loaded one outputs:

$ python3 load_ner.py
Who WP  2
is VBZ  2
Shaka NNP  2
Khan NNP  2
? .  2

(notice the missing ent_type_!)

I've included code to repro here:
https://github.com/savvopoulos/spaCy/commit/943f979ce4b80837b873fd3d10091c6db5b4b484

docs usage

Source

savv

Most helpful comment

Hey,

Sorry about the lack of clarity on this. I think others have been having trouble with this too.

There are a few problems with your scripts.

1. EntityRecognizer.load() is a classmethod --- it returns a new instance of EntityRecognizer. It doesn't modify the instance in place. So, your line ner.load() does nothing. What you need is ner = EntityRecognizer.load()
2. You should save and load the vocabulary as well as the entity recognition model.
3. You should either save and load the tagger, or not use the tagger when running the NER.

These seem to work for me. I'll work on getting the docs fixed.

# Train NER

# encoding: utf8
from __future__ import unicode_literals, print_function
import ujson as json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger


def train_ner(nlp, train_data, entity_types):
    """Train a new EntityRecognizer on ``train_data`` and return it.

    nlp: pipeline object; only ``nlp.make_doc`` and ``nlp.vocab`` are used.
    train_data: list of ``(raw_text, entity_offsets)`` pairs, where
        ``entity_offsets`` is handed to ``GoldParse(entities=...)``. The list
        is shuffled in place on every pass, so the caller's ordering changes.
    entity_types: labels forwarded to the EntityRecognizer constructor.
    """
    # Look every training token up in the vocab before the model is built
    # (presumably so each word gets a vocab entry -- confirm against spaCy).
    for text, _ in train_data:
        for token in nlp.make_doc(text):
            nlp.vocab[token.orth]
    recognizer = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    n_passes = 5
    for _pass in range(n_passes):
        random.shuffle(train_data)
        for text, offsets in train_data:
            tokens = nlp.make_doc(text)
            recognizer.update(tokens, GoldParse(tokens, entities=offsets))
    recognizer.model.end_training()
    return recognizer


def main(model_dir=None):
    """Train a toy NER model, print its predictions, and optionally save it.

    model_dir: directory to save the trained model and vocab into. It is
        created when it does not exist. When None, nothing is written.

    Files written under ``model_dir``: ``config.json``, ``model``,
    ``vocab/lexemes.bin`` and ``vocab/strings.json``.
    """
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    # Entity annotations are (start, end, label) tuples; the offsets are
    # computed with len() on the text prefix up to each boundary.
    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
            (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    # The tagger is intentionally not run here: it is not saved with the NER
    # model, so the loading side cannot run it either.
    #nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        # Serialise first, then write bytes: json.dump() emits text on
        # Python 3, which a file opened in 'wb' mode rejects with a TypeError.
        config_data = json.dumps(ner.cfg)
        if not isinstance(config_data, bytes):
            config_data = config_data.encode('utf8')
        with (model_dir / 'config.json').open('wb') as file_:
            file_.write(config_data)
        ner.model.dump(str(model_dir / 'model'))
        # The vocab is saved alongside the model so the loading script can
        # restore the same strings and lexemes.
        if not (model_dir / 'vocab').exists():
            (model_dir / 'vocab').mkdir()
        ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
        with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
            ner.vocab.strings.dump(file_)


if __name__ == '__main__':
    # Train the toy model and save it into the relative directory 'ner'.
    main('ner')
    # Sample output, abridged: main() prints more columns per token than are
    # shown here. The columns below appear to be text, (empty) tag,
    # entity type and ent_iob -- verify against an actual run.
    # Who "" 2
    # is "" 2
    # Shaka "" PERSON 3
    # Khan "" PERSON 1
    # ? "" 2

# Load NER
from __future__ import unicode_literals
import spacy
import pathlib
from spacy.pipeline import EntityRecognizer
from spacy.vocab import Vocab


# Build the pipeline without its stock parser/NER, then restore the vocab
# files from 'ner/vocab' before loading the trained entity recognizer.
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
vocab_dir = pathlib.Path('ner/vocab')
strings_path = vocab_dir / 'strings.json'
with strings_path.open('r', encoding='utf8') as strings_file:
    nlp.vocab.strings.load(strings_file)
nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')
# EntityRecognizer.load() is a classmethod: keep the instance it returns.
ner = EntityRecognizer.load(pathlib.Path("ner"), nlp.vocab, require=True)
doc = nlp.make_doc('Who is Shaka Khan?')
# The tagger is left disabled here; enable it only if it is saved and
# loaded as well.
#nlp.tagger(doc)
ner(doc)
for token in doc:
    print(token.text, token.orth, token.lower, token.ent_type_)

honnibal on 1 Dec 2016

👍3 🎉2

All 6 comments

Hey,

Sorry about the lack of clarity on this. I think others have been having trouble with this too.

There are a few problems with your scripts.

1. EntityRecognizer.load() is a classmethod --- it returns a new instance of EntityRecognizer. It doesn't modify the instance in place. So, your line ner.load() does nothing. What you need is ner = EntityRecognizer.load()
2. You should save and load the vocabulary as well as the entity recognition model.
3. You should either save and load the tagger, or not use the tagger when running the NER.

These seem to work for me. I'll work on getting the docs fixed.

# Train NER

# encoding: utf8
from __future__ import unicode_literals, print_function
import ujson as json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger


def train_ner(nlp, train_data, entity_types):
    """Train a new EntityRecognizer on ``train_data`` and return it.

    nlp: pipeline object; only ``nlp.make_doc`` and ``nlp.vocab`` are used.
    train_data: list of ``(raw_text, entity_offsets)`` pairs, where
        ``entity_offsets`` is handed to ``GoldParse(entities=...)``. The list
        is shuffled in place on every pass, so the caller's ordering changes.
    entity_types: labels forwarded to the EntityRecognizer constructor.
    """
    # Look every training token up in the vocab before the model is built
    # (presumably so each word gets a vocab entry -- confirm against spaCy).
    for text, _ in train_data:
        for token in nlp.make_doc(text):
            nlp.vocab[token.orth]
    recognizer = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    n_passes = 5
    for _pass in range(n_passes):
        random.shuffle(train_data)
        for text, offsets in train_data:
            tokens = nlp.make_doc(text)
            recognizer.update(tokens, GoldParse(tokens, entities=offsets))
    recognizer.model.end_training()
    return recognizer


def main(model_dir=None):
    """Train a toy NER model, print its predictions, and optionally save it.

    model_dir: directory to save the trained model and vocab into. It is
        created when it does not exist. When None, nothing is written.

    Files written under ``model_dir``: ``config.json``, ``model``,
    ``vocab/lexemes.bin`` and ``vocab/strings.json``.
    """
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    # Entity annotations are (start, end, label) tuples; the offsets are
    # computed with len() on the text prefix up to each boundary.
    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
            (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    # The tagger is intentionally not run here: it is not saved with the NER
    # model, so the loading side cannot run it either.
    #nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        # Serialise first, then write bytes: json.dump() emits text on
        # Python 3, which a file opened in 'wb' mode rejects with a TypeError.
        config_data = json.dumps(ner.cfg)
        if not isinstance(config_data, bytes):
            config_data = config_data.encode('utf8')
        with (model_dir / 'config.json').open('wb') as file_:
            file_.write(config_data)
        ner.model.dump(str(model_dir / 'model'))
        # The vocab is saved alongside the model so the loading script can
        # restore the same strings and lexemes.
        if not (model_dir / 'vocab').exists():
            (model_dir / 'vocab').mkdir()
        ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
        with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
            ner.vocab.strings.dump(file_)


if __name__ == '__main__':
    # Train the toy model and save it into the relative directory 'ner'.
    main('ner')
    # Sample output, abridged: main() prints more columns per token than are
    # shown here. The columns below appear to be text, (empty) tag,
    # entity type and ent_iob -- verify against an actual run.
    # Who "" 2
    # is "" 2
    # Shaka "" PERSON 3
    # Khan "" PERSON 1
    # ? "" 2

# Load NER
from __future__ import unicode_literals
import spacy
import pathlib
from spacy.pipeline import EntityRecognizer
from spacy.vocab import Vocab


# Build the pipeline without its stock parser/NER, then restore the vocab
# files from 'ner/vocab' before loading the trained entity recognizer.
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
vocab_dir = pathlib.Path('ner/vocab')
strings_path = vocab_dir / 'strings.json'
with strings_path.open('r', encoding='utf8') as strings_file:
    nlp.vocab.strings.load(strings_file)
nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')
# EntityRecognizer.load() is a classmethod: keep the instance it returns.
ner = EntityRecognizer.load(pathlib.Path("ner"), nlp.vocab, require=True)
doc = nlp.make_doc('Who is Shaka Khan?')
# The tagger is left disabled here; enable it only if it is saved and
# loaded as well.
#nlp.tagger(doc)
ner(doc)
for token in doc:
    print(token.text, token.orth, token.lower, token.ent_type_)

honnibal on 1 Dec 2016

👍3 🎉2

Many thanks for your answer!

Re: 2
Why do I need to save + load the vocab, given that I'm using spacy's vocab?
I see you added this line: _ = nlp.vocab[word.orth]
Does this add ortho features for previously unknown words, and thus help with training?

Re: 3
Similarly, why save the tagger, given that I'm using spacy's?

Also:
Does having the tags/vectors for a document improve new NER's performance?

savv on 5 Dec 2016

BTW. If I use tagger.model.dump, I get:

Traceback (most recent call last):
  File "train_ner.py", line 82, in <module>
    main('ner')
  File "train_ner.py", line 77, in main
    nlp.tagger.model.dump(str(model_dir / 'pos' / 'model'))
  File "thinc/linear/avgtron.pyx", line 75, in thinc.linear.avgtron.AveragedPerceptron.dump (thinc/linear/avgtron.cpp:2754)
  File "thinc/linear/serialize.pyx", line 23, in thinc.linear.serialize.Writer.__init__ (thinc/linear/serialize.cpp:1521)
AssertionError

savv on 12 Dec 2016

Finally getting back on top of the backlog a bit.

Is this closable, given your PR #679 ?

honnibal on 9 Jan 2017

Yes, and they also still work under 1.5.

savv on 9 Jan 2017

This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.