spaCy pretrain
I have a small CoNLL dataset. I want to use BERT and ELMo for training. Does spaCy provide any converter for BERT and ELMo vectors to be used with the spaCy init command, or for tok2vec during training?
I don't think spaCy does, nor should it. Implementing this yourself should be rather easy. Are you interested in using the final representations of BERT/ELMo, or only the output of their embedding layers?
To get vector representations from a pretrained BERT model, you can do the following. If you work with batches, you'll need to change things around a bit and include an attention mask (a rough batched sketch follows the single-sentence snippet below). I highly suggest reading through the documentation of the PyTorch implementation.
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
import torch


def init_bert(model_name):
    model = BertModel.from_pretrained(model_name)
    model.eval()
    # if the model is uncased, lowercase the input as well
    do_lower_case = model_name.endswith('-uncased')
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    return model, tokenizer


def forward_bert(sentence, model, bert_tokenizer, max_length=30):
    # the tokenizer will also separate on punctuation
    # see https://github.com/google-research/bert#tokenization
    tokens = bert_tokenizer.tokenize(sentence)
    # limit the number of tokens (-2 to account for [CLS] and [SEP])
    if len(tokens) > max_length - 2:
        tokens = tokens[0:(max_length - 2)]
    # add [CLS] and [SEP], as expected by BERT
    tokens = ['[CLS]', *tokens, '[SEP]']
    bert_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    # unsqueeze(0) because BERT expects a batch dimension first: (batch_size, seq_len)
    bert_ids = torch.LongTensor(bert_ids).unsqueeze(0)
    with torch.no_grad():
        all_bert_layers, _ = model(bert_ids)
    # in the BERT paper, the best results were obtained by concatenating the last 4 layers
    bert_concat_last = torch.cat(tuple(all_bert_layers[-4:]), dim=-1)
    # squeeze to get the first sentence (assuming we only submitted one sentence)
    bert_sent_vec = bert_concat_last.squeeze()
    return bert_sent_vec


if __name__ == '__main__':
    bert_model, bert_tokenizer = init_bert('bert-base-uncased')
    bert_vec = forward_bert('Hello beautiful world.', bert_model, bert_tokenizer)
    print(bert_vec.size())
    # torch.Size([6, 3072])
    print(bert_vec)
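As a rough illustration of the batched case mentioned above, here is a minimal sketch that reuses init_bert() and pads every sentence to the longest one in the batch. The padding and masking logic is my own assumption, not taken from the BERT docs, so treat it as a starting point rather than a reference implementation.

import torch


def forward_bert_batch(sentences, model, bert_tokenizer, max_length=30):
    # tokenize and truncate each sentence, then add [CLS] and [SEP]
    all_tokens = []
    for sentence in sentences:
        tokens = bert_tokenizer.tokenize(sentence)[:max_length - 2]
        all_tokens.append(['[CLS]', *tokens, '[SEP]'])
    # pad to the longest sentence in the batch (id 0 is [PAD] in the BERT vocab)
    batch_max = max(len(tokens) for tokens in all_tokens)
    input_ids, attention_mask = [], []
    for tokens in all_tokens:
        ids = bert_tokenizer.convert_tokens_to_ids(tokens)
        padding = [0] * (batch_max - len(ids))
        input_ids.append(ids + padding)
        attention_mask.append([1] * len(ids) + padding)  # 1 = real token, 0 = padding
    input_ids = torch.LongTensor(input_ids)          # (batch_size, seq_len)
    attention_mask = torch.LongTensor(attention_mask)
    with torch.no_grad():
        all_bert_layers, _ = model(input_ids, attention_mask=attention_mask)
    # concatenate the last 4 layers, as in the single-sentence version
    return torch.cat(tuple(all_bert_layers[-4:]), dim=-1)  # (batch_size, seq_len, 3072)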
For ELMo, it's a bit easier. The input has to be tokenized already, though.
from allennlp.modules.elmo import Elmo, batch_to_ids
import torch


def init_elmo(options, weight):
    model = Elmo(options, weight, 1)
    model.eval()
    return model


def forward_elmo(tokens, model):
    # add <S> and </S> tokens to the sentence
    # see https://github.com/allenai/allennlp/blob/master/tutorials/how_to/elmo.md#notes-on-statefulness-and-non-determinism
    tokens = [['<S>', *tokens, '</S>']]
    # batch of size 1 in this case
    elmo_ids = batch_to_ids(tokens)
    with torch.no_grad():
        elmo_out = model(elmo_ids)
    # only one representation was requested, so get it by its first index
    # squeeze because we only forwarded one sentence (batch_size=1)
    elmo_sent_vec = elmo_out['elmo_representations'][0].squeeze()
    return elmo_sent_vec


if __name__ == '__main__':
    elmo_model = init_elmo('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json',
                           'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5')
    elmo_vec = forward_elmo(['Hello', 'beautiful', 'world', '!'], elmo_model)
    print(elmo_vec.size())
    # torch.Size([6, 1024])
    print(elmo_vec)
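Since the original question was about spaCy, one way to produce the pre-tokenized input for forward_elmo() is to let spaCy do the tokenization. A minimal sketch, assuming the ELMo snippet above has already been run; spacy.blank('en') is an arbitrary choice and any loaded pipeline would work the same way:

import spacy

# use spaCy only for tokenization and pass the token texts to forward_elmo()
nlp = spacy.blank('en')
doc = nlp('Hello beautiful world!')
tokens = [token.text for token in doc]
elmo_vec = forward_elmo(tokens, elmo_model)
print(elmo_vec.size())  # one row per token, plus the added <S> and </S>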
Tried to use it with BERT. Getting the following error:

  File "test.py", line 42, in <module>
    bert_vec = forward_bert('hello world', bert_model, bert_tokenizer)
  File "test.py", line 35, in forward_bert
    bert_concat_last = torch.cat(tuple(all_bert_layers[-1:-4]), dim=-1)
RuntimeError: expected a non-empty list of Tensors
Oops, you are right! There was a typo at the slicing step, which should just be all_bert_layers[-4:]. I fixed it in the original post. It should work now.
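In case it helps to see why the original slice produced an empty sequence, here is a tiny illustration with a plain Python list standing in for the layer outputs:

# with the default positive step, a slice from the last element backwards selects nothing
layers = list(range(12))  # stand-in for the 12 encoder layer outputs
print(layers[-1:-4])      # [] -> this is what made torch.cat complain
print(layers[-4:])        # [8, 9, 10, 11] -> the last four layers, as intended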
Thanks. I'm trying to calculate the cosine similarity of two vectors, but the output produced above is a 3-dimensional tensor, while cosine_similarity expects a 2-D input. Is there a way to reshape it?
Good point. Considering that we expect one sentence as input, it makes sense to also expect a 2-dim output. I have updated the snippet accordingly.
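For reference, here is a minimal sketch of one way to compare two sentences with the 2-D output: pool the per-token vectors into a single sentence vector each and use torch's cosine_similarity. Mean pooling is my own choice here, not something prescribed by BERT; taking the [CLS] row or the pooled output would be alternatives.

import torch.nn.functional as F

# mean-pool the (num_tokens, 3072) outputs into one vector per sentence
vec_a = forward_bert('Hello beautiful world.', bert_model, bert_tokenizer).mean(dim=0)
vec_b = forward_bert('Goodbye cruel world.', bert_model, bert_tokenizer).mean(dim=0)
# cosine_similarity works on batches, so add a batch dimension of 1
similarity = F.cosine_similarity(vec_a.unsqueeze(0), vec_b.unsqueeze(0)).item()
print(similarity)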
Tried it out; I'm getting a different error now:

Traceback (most recent call last):
  File "test2.py", line 46, in <module>
    bert_model, bert_tokenizer = init_bert('bert-base-uncased')
  File "test2.py", line 8, in init_bert
    model = BertModel.from_pretrained(model_name)
  File "/usr/local/lib/python3.7/site-packages/pytorch_pretrained_bert/modeling.py", line 590, in from_pretrained
    archive.extractall(tempdir)
  File "/usr/local/lib/python3.7/tarfile.py", line 2000, in extractall
    numeric_owner=numeric_owner)
  File "/usr/local/lib/python3.7/tarfile.py", line 2042, in extract
    numeric_owner=numeric_owner)
  File "/usr/local/lib/python3.7/tarfile.py", line 2112, in _extract_member
    self.makefile(tarinfo, targetpath)
  File "/usr/local/lib/python3.7/tarfile.py", line 2161, in makefile
    copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
  File "/usr/local/lib/python3.7/tarfile.py", line 247, in copyfileobj
    buf = src.read(bufsize)
  File "/usr/local/lib/python3.7/gzip.py", line 276, in read
    return self._buffer.read(size)
  File "/usr/local/lib/python3.7/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
  File "/usr/local/lib/python3.7/gzip.py", line 482, in read
    raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
This may not be a problem with the modified script. Could this be a problem with the download?
Yes, looks like it. Find the location where the model was downloaded and delete it so you can download it again.
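If it helps to locate the download, pytorch_pretrained_bert keeps its models in a cache directory; as far as I remember it is exposed as a constant in file_utils, but treat the exact import path and default location as an assumption:

# assumption: the cache dir constant lives in pytorch_pretrained_bert.file_utils
# and defaults to something like ~/.pytorch_pretrained_bert
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

print(PYTORCH_PRETRAINED_BERT_CACHE)
# delete the corrupted archive in that directory, then call
# BertModel.from_pretrained('bert-base-uncased') again to re-download it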
This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.