Despite trying several answers from Stack Overflow and elsewhere, I'm still getting the "word not in vocabulary" error:
# Raw corpus: one pre-segmented line of text per entry, with words separated
# by whitespace.
documents = [['༄༅། '],
             ['ལྷ་ བྲག་ ཐུགས་ སྒྲུབ་ དྲག་པོ་ རྩལ་ གྱི་ སྨིན་ ལམ་ དབང་ གི་ ཆུ་བོ་ ཆེན་ མོ་ ཁྱེར་ བདེར་ བསྡེབས་པ་ རིན་ཆེན་ ཕྲ་ མཛེས་ ཞེས་ བྱ་བ་ བཞུགས་ སོ'],
             [' ན་མོ་ གུ་རུ་ ཀྲོ་ ཤྲཱི་ ཧེ་ རུ་ ཀཱ་'],
             [' རིག་རྩལ་ དབང་ ཐོབ་ རྡོ་རྗེ་ དྲག་པོ་ རྩལ'],
             ['དཀོན་མཆོག་ རྩ་ གསུམ་ ཡོངས་འདུས་ ཞབས་ པདྨོ'],
             ['མི་ འབྲལ་ གི་ ཐིག་ལེར་ བཀོད་པ་ ལ'],
             ['རྡོ་རྗེའི་ བྱིན་ ཕོབ་ བཀའ་ ཡི་ གནང་ སྩོལ'],
             ['བོད་ ཁམས་ སྐྱོབ་ བྱེད་ མཛོད་ ལྔའི་ ལྷོ་ཕྱོགས་ བཅུད'],
             ['དྲག་རྩལ་ ཐུགས་ སྒྲུབ་ ཆོས་ཚན་ ཉེར་ ལྔ་ པའི'],
             ['བདུན་ པ་ ལས་ཅན་ སྨིན་ ལམ་ ཐེམས་ ཀྱི']]

# Word2Vec treats every item of a sentence as ONE vocabulary entry, so each
# sentence must be a list of individual tokens. Passing the whole line as a
# single string made the entire line the "word", which is why looking up a
# real word raised KeyError. Split every line on whitespace; str.split() with
# no argument also drops the leading/trailing blanks present in the raw data.
documents = [
    [token for text in document for token in text.split()]
    for document in documents
]

# NOTE: `size` is the gensim 3.x parameter name (the reported environment is
# gensim 3.4.0); gensim 4.x renamed it to `vector_size`.
model = gensim.models.Word2Vec(
    documents,
    size=150,
    window=10,
    min_count=1,  # keep every token; the corpus is tiny
    workers=10,
)

# The constructor above already builds the vocabulary and trains once because
# sentences were supplied; this call runs additional passes over the corpus.
model.train(documents, total_examples=len(documents), epochs=10)

# Look vectors up through `model.wv`; indexing the model directly is
# deprecated in gensim 3.x and removed in 4.0.
model.wv['གུ་རུ་']
KeyError Traceback (most recent call last)
<ipython-input-153-ed3ff6a264de> in <module>()
26 model.train(documents, total_examples=len(documents), epochs=10)
27
---> 28 model['གུ་རུ་']
~/dev/astetik_test/lib/python3.6/site-packages/gensim/utils.py in new_func1(*args, **kwargs)
1396 stacklevel=2
1397 )
-> 1398 return func(*args, **kwargs)
1399
1400 return new_func1
~/dev/astetik_test/lib/python3.6/site-packages/gensim/models/word2vec.py in __getitem__(self, words)
819 Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`
820 """
--> 821 return self.wv.__getitem__(words)
822
823 @deprecated("Method will be removed in 4.0.0, use self.wv.__contains__() instead")
~/dev/astetik_test/lib/python3.6/site-packages/gensim/models/keyedvectors.py in __getitem__(self, entities)
167 if isinstance(entities, string_types):
168 # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
--> 169 return self.get_vector(entities)
170
171 return vstack([self.get_vector(entity) for entity in entities])
~/dev/astetik_test/lib/python3.6/site-packages/gensim/models/keyedvectors.py in get_vector(self, word)
275
276 def get_vector(self, word):
--> 277 return self.word_vec(word)
278
279 def words_closer_than(self, w1, w2):
~/dev/astetik_test/lib/python3.6/site-packages/gensim/models/keyedvectors.py in word_vec(self, word, use_norm)
272 return result
273 else:
--> 274 raise KeyError("word '%s' not in vocabulary" % word)
275
276 def get_vector(self, word):
KeyError: "word 'གུ་རུ་' not in vocabulary"
Darwin-15.6.0-x86_64-i386-64bit
Python 3.6.5 (default, Mar 30 2018, 06:41:49)
[GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.42.1)]
NumPy 1.14.2
SciPy 1.0.1
gensim 3.4.0
FAST_VERSION 0
OK, I figured it out. The docstring actually states it clearly enough; it's just that the various sources online don't. The input has to be a list of lists, where each item of an inner list is a single token (word).
I have the same problem. Can you please help me?
@aimaaonline Can you share the contents of your documents list object? You probably don't have it as a list of lists where each list item is a single token (most likely you have several tokens/words per list item).
Most helpful comment
OK, I figured it out. The docstring actually states it clearly enough; it's just that the various sources online don't. The input has to be a list of lists, where each item of an inner list is a single token (word).