Had to change
kb = KnowledgeBase(vocab=nlp.vocab)
to
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
kb_pretrain.py file, I have made some changes to resolve error that I got after copying and pasting pretrain code from the guide pretrain_kb.py
#!/usr/bin/env python
# coding: utf8
"""Example of defining and (pre)training spaCy's knowledge base,
which is needed to implement entity linking functionality.
For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy v2.2
Last tested with: v2.2
"""
from __future__ import unicode_literals, print_function
import plac
from pathlib import Path
from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
INPUT_DIM = 300 # dimension of pretrained input vectors
DESC_WIDTH = 64 # dimension of output entity vectors
@plac.annotations(
vocab_path=("Path to the vocab for the kb", "option", "v", Path),
model=("Model name, should have pretrained word embeddings", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),
)
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
"""Load the model, create the KB and pretrain the entity encodings.
Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
If an output_dir is provided, the KB will be stored there in a file 'kb'.
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
if model is None and vocab_path is None:
raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
vocab = Vocab().from_disk(vocab_path)
# create blank Language class with specified vocab
nlp = spacy.blank("en", vocab=vocab)
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
# set up the data
entity_ids = []
descriptions = []
freqs = []
for key, value in ENTITIES.items():
desc, freq = value
entity_ids.append(key)
descriptions.append(desc)
freqs.append(freq)
# training entity description encodings
# this part can easily be replaced with a custom entity encoder
encoder = EntityEncoder(
nlp=nlp,
input_dim=INPUT_DIM,
desc_width=DESC_WIDTH,
)
encoder.train(description_list=descriptions, to_print=True)
# get the pretrained entity vectors
embeddings = encoder.apply_encoder(descriptions)
# set the entities, can also be done by calling `kb.add_entity` for each entity
kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)
# adding aliases, the entities need to be defined in the KB beforehand
kb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.24, 0.7], # the sum of these probabilities should not exceed 1
)
# test the trained model
print()
_print_kb(kb)
# save model to output directory
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
kb_path = str(output_dir / "kb")
kb.dump(kb_path)
print()
print("Saved KB to", kb_path)
# only storing the vocab if we weren't already reading it from file
if not vocab_path:
vocab_path = output_dir / "vocab"
kb.vocab.to_disk(vocab_path)
print("Saved vocab to", vocab_path)
print()
# test the saved model
# always reload a knowledge base with the same vocab instance!
print("Loading vocab from", vocab_path)
print("Loading KB from", kb_path)
vocab2 = Vocab().from_disk(vocab_path)
kb2 = KnowledgeBase(vocab=vocab2, entity_vector_length=64)
kb2.load_bulk(kb_path)
_print_kb(kb2)
print()
def _print_kb(kb):
print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())
if __name__ == "__main__":
plac.call(main)
# Expected output:
# 2 kb entities: ['Q2146908', 'Q7381115']
# 1 kb aliases: ['Russ Cochran']
running kb_train file with
python kb_train.py testKB/kb testKB/vocab -o trainedKB
Created blank 'en' model with vocab from 'testKB/vocab'
Loaded Knowledge Base from 'testKB/kb'
Traceback (most recent call last):
File "kb_train.py", line 155, in <module>
plac.call(main)
File "/home/geek/anaconda3/lib/python3.7/site-packages/plac_core.py", line 328, in call
cmd, result = parser.consume(arglist)
File "/home/geek/anaconda3/lib/python3.7/site-packages/plac_core.py", line 207, in consume
return cmd, self.func(*(args + varargs + extraopts), **kwargs)
File "kb_train.py", line 115, in main
sgd=optimizer,
File "/home/geek/anaconda3/lib/python3.7/site-packages/spacy/language.py", line 475, in update
proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs)
File "pipes.pyx", line 1191, in spacy.pipeline.pipes.EntityLinker.update
KeyError: '0_12'
It looks like the problem might be that you're running spaCy v2.1.8, but are trying to use the script compatible with v2.2 (see file header).
Here's the script for v2.1.8: https://github.com/explosion/spaCy/tree/v2.1.8/bin/wiki_entity_linking
The entity linking functionality is still new and updated frequently, though, so it's probably best to always use the latest version of spaCy if you want to experiment with entity linking.
@ines Yes, upgrading spacy solves the problem. Thanks.
This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.
Most helpful comment
It looks like the problem might be that you're running spaCy v2.1.8, but are trying to use the script compatible with v2.2 (see file header).
Here's the script for v2.1.8: https://github.com/explosion/spaCy/tree/v2.1.8/bin/wiki_entity_linking
The entity linking functionality is still new and updated frequently, though, so it's probably best to always use the latest version of spaCy if you want to experiment with entity linking.