spaCy pretrain
I have a small CoNLL dataset. I want to use BERT and ELMo for training. Does spaCy provide any converter for BERT and ELMo vectors to be used with the spaCy init command, or for tok2vec during training?
I don't think spaCy does, nor should it. Implementing this yourself should be rather easy. Are you interested in using the final representations of BERT/ELMo, or only the output of their embedding layers?
To get vector representations from a pretrained BERT model, you can do the following. If you work with batches, you'll need to change things around a bit and include an attention mask (a rough batched sketch follows the single-sentence snippet below). I highly suggest reading through the documentation of the PyTorch implementation.
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
import torch


def init_bert(model_name):
    model = BertModel.from_pretrained(model_name)
    model.eval()
    # if the model is uncased, lowercase the input as well
    do_lower_case = model_name.endswith('-uncased')
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    return model, tokenizer


def forward_bert(sentence, model, bert_tokenizer, max_length=30):
    # the tokenizer will also separate on punctuation
    # see https://github.com/google-research/bert#tokenization
    tokens = bert_tokenizer.tokenize(sentence)
    # limit the number of tokens (-2 to account for [CLS] and [SEP])
    if len(tokens) > max_length - 2:
        tokens = tokens[0:(max_length - 2)]
    # add [CLS] and [SEP], as expected by BERT
    tokens = ['[CLS]', *tokens, '[SEP]']
    bert_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    # unsqueeze(0) because BERT expects a batch dimension first: (batch_size, seq_len)
    bert_ids = torch.LongTensor(bert_ids).unsqueeze(0)
    with torch.no_grad():
        all_bert_layers, _ = model(bert_ids)
    # in the BERT paper, the best results were obtained by concatenating the last 4 layers
    bert_concat_last = torch.cat(tuple(all_bert_layers[-4:]), dim=-1)
    # squeeze to get the first sentence (assuming we only submitted one sentence)
    bert_sent_vec = bert_concat_last.squeeze()
    return bert_sent_vec


if __name__ == '__main__':
    bert_model, bert_tokenizer = init_bert('bert-base-uncased')
    bert_vec = forward_bert('Hello beautiful world.', bert_model, bert_tokenizer)
    print(bert_vec.size())
    # torch.Size([6, 3072])
    print(bert_vec)
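As a rough illustration of the batched case mentioned above, here is a minimal sketch that reuses init_bert() and pads every sentence to the longest one in the batch. The padding and masking logic is my own assumption, not taken from the BERT docs, so treat it as a starting point rather than a reference implementation.

import torch


def forward_bert_batch(sentences, model, bert_tokenizer, max_length=30):
    # tokenize and truncate each sentence, then add [CLS] and [SEP]
    all_tokens = []
    for sentence in sentences:
        tokens = bert_tokenizer.tokenize(sentence)[:max_length - 2]
        all_tokens.append(['[CLS]', *tokens, '[SEP]'])
    # pad to the longest sentence in the batch (id 0 is [PAD] in the BERT vocab)
    batch_max = max(len(tokens) for tokens in all_tokens)
    input_ids, attention_mask = [], []
    for tokens in all_tokens:
        ids = bert_tokenizer.convert_tokens_to_ids(tokens)
        padding = [0] * (batch_max - len(ids))
        input_ids.append(ids + padding)
        attention_mask.append([1] * len(ids) + padding)  # 1 = real token, 0 = padding
    input_ids = torch.LongTensor(input_ids)          # (batch_size, seq_len)
    attention_mask = torch.LongTensor(attention_mask)
    with torch.no_grad():
        all_bert_layers, _ = model(input_ids, attention_mask=attention_mask)
    # concatenate the last 4 layers, as in the single-sentence version
    return torch.cat(tuple(all_bert_layers[-4:]), dim=-1)  # (batch_size, seq_len, 3072)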
For ELMo, it's a bit easier. The input has to be tokenized already, though.
from allennlp.modules.elmo import Elmo, batch_to_ids
import torch


def init_elmo(options, weight):
    model = Elmo(options, weight, 1)
    model.eval()
    return model


def forward_elmo(tokens, model):
    # add <S> and </S> tokens to the sentence
    # see https://github.com/allenai/allennlp/blob/master/tutorials/how_to/elmo.md#notes-on-statefulness-and-non-determinism
    tokens = [['<S>', *tokens, '</S>']]
    # batch of size 1 in this case
    elmo_ids = batch_to_ids(tokens)
    with torch.no_grad():
        elmo_out = model(elmo_ids)
    # only one representation was requested, so get it by its first index
    # squeeze because we only forwarded one sentence (batch_size=1)
    elmo_sent_vec = elmo_out['elmo_representations'][0].squeeze()
    return elmo_sent_vec


if __name__ == '__main__':
    elmo_model = init_elmo('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json',
                           'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5')
    elmo_vec = forward_elmo(['Hello', 'beautiful', 'world', '!'], elmo_model)
    print(elmo_vec.size())
    # torch.Size([6, 1024])
    print(elmo_vec)
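Since the original question was about spaCy, one way to produce the pre-tokenized input for forward_elmo() is to let spaCy do the tokenization. A minimal sketch, assuming the ELMo snippet above has already been run; spacy.blank('en') is an arbitrary choice and any loaded pipeline would work the same way:

import spacy

# use spaCy only for tokenization and pass the token texts to forward_elmo()
nlp = spacy.blank('en')
doc = nlp('Hello beautiful world!')
tokens = [token.text for token in doc]
elmo_vec = forward_elmo(tokens, elmo_model)
print(elmo_vec.size())  # one row per token, plus the added <S> and </S>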
Tried to use it with BERT. Getting the following error:

  File "test.py", line 42, in <module>
    bert_vec = forward_bert('hello world', bert_model, bert_tokenizer)
  File "test.py", line 35, in forward_bert
    bert_concat_last = torch.cat(tuple(all_bert_layers[-1:-4]), dim=-1)
RuntimeError: expected a non-empty list of Tensors
Oops, you are right! There was a typo at the slicing step, which should just be all_bert_layers[-4:]. I fixed it in the original post. It should work now.
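In case it helps to see why the original slice produced an empty sequence, here is a tiny illustration with a plain Python list standing in for the layer outputs:

# with the default positive step, a slice from the last element backwards selects nothing
layers = list(range(12))  # stand-in for the 12 encoder layer outputs
print(layers[-1:-4])      # [] -> this is what made torch.cat complain
print(layers[-4:])        # [8, 9, 10, 11] -> the last four layers, as intended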
Thanks. I'm trying to calculate the cosine similarity of two vectors, but the output produced above is a 3-dimensional tensor, while cosine_similarity expects a 2-D input. Is there a way to reshape it?
Good point. Considering that we expect one sentence as input, it makes sense to also expect a 2-dim output. I have updated the snippet accordingly.
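For reference, here is a minimal sketch of one way to compare two sentences with the 2-D output: pool the per-token vectors into a single sentence vector each and use torch's cosine_similarity. Mean pooling is my own choice here, not something prescribed by BERT; taking the [CLS] row or the pooled output would be alternatives.

import torch.nn.functional as F

# mean-pool the (num_tokens, 3072) outputs into one vector per sentence
vec_a = forward_bert('Hello beautiful world.', bert_model, bert_tokenizer).mean(dim=0)
vec_b = forward_bert('Goodbye cruel world.', bert_model, bert_tokenizer).mean(dim=0)
# cosine_similarity works on batches, so add a batch dimension of 1
similarity = F.cosine_similarity(vec_a.unsqueeze(0), vec_b.unsqueeze(0)).item()
print(similarity)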
Tried it out; I'm getting a different error now:

Traceback (most recent call last):
  File "test2.py", line 46, in <module>
    bert_model, bert_tokenizer = init_bert('bert-base-uncased')
  File "test2.py", line 8, in init_bert
    model = BertModel.from_pretrained(model_name)
  File "/usr/local/lib/python3.7/site-packages/pytorch_pretrained_bert/modeling.py", line 590, in from_pretrained
    archive.extractall(tempdir)
  File "/usr/local/lib/python3.7/tarfile.py", line 2000, in extractall
    numeric_owner=numeric_owner)
  File "/usr/local/lib/python3.7/tarfile.py", line 2042, in extract
    numeric_owner=numeric_owner)
  File "/usr/local/lib/python3.7/tarfile.py", line 2112, in _extract_member
    self.makefile(tarinfo, targetpath)
  File "/usr/local/lib/python3.7/tarfile.py", line 2161, in makefile
    copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
  File "/usr/local/lib/python3.7/tarfile.py", line 247, in copyfileobj
    buf = src.read(bufsize)
  File "/usr/local/lib/python3.7/gzip.py", line 276, in read
    return self._buffer.read(size)
  File "/usr/local/lib/python3.7/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
  File "/usr/local/lib/python3.7/gzip.py", line 482, in read
    raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
This may not be a problem with the modified script. Could this be a problem with the download?
Yes, looks like it. Find the location where the model was downloaded and delete it so you can download it again.
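If it helps to locate the download, pytorch_pretrained_bert keeps its models in a cache directory; as far as I remember it is exposed as a constant in file_utils, but treat the exact import path and default location as an assumption:

# assumption: the cache dir constant lives in pytorch_pretrained_bert.file_utils
# and defaults to something like ~/.pytorch_pretrained_bert
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

print(PYTORCH_PRETRAINED_BERT_CACHE)
# delete the corrupted archive in that directory, then call
# BertModel.from_pretrained('bert-base-uncased') again to re-download it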
This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.