Gensim: ldamodel does not accept csc matrix

Created on 14 Mar 2020 · 3 Comments · Source: RaRe-Technologies/gensim

The docs say a scipy sparse CSC matrix can be used, but it can't. It works when the matrix is wrapped in Sparse2Corpus. Here is an example:

import gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim.matutils import Sparse2Corpus
from random_word import RandomWords
#r = RandomWords()
# texts = []
# for x in range(10):
#     texts.append(" ".join(r.get_random_words(limit="200")))
vec = CountVectorizer()
docterm = vec.fit_transform(texts)
termdoc = docterm.T.tocsc()
#termdoc = Sparse2Corpus(termdoc)
id2word = {v:k for k,v in vec.vocabulary_.items()}
ldamodel = gensim.models.ldamodel.LdaModel(termdoc, 2, id2word=id2word, passes=10)

I think there are two different error messages, depending on the length of texts. Here are both:

TypeError Traceback (most recent call last)
in
12 #termdoc = Sparse2Corpus(termdoc)
13 id2word = {v:k for k,v in vec.vocabulary_.items()}
---> 14 ldamodel = gensim.models.ldamodel.LdaModel(termdoc, 2, id2word=id2word, passes=10)

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in __init__(self, corpus, num_topics, id2word, distributed, chunksize, passes, update_every, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, minimum_probability, random_state, ns_conf, minimum_phi_value, per_word_topics, callbacks, dtype)
517 if corpus is not None:
518 use_numpy = self.dispatcher is not None
--> 519 self.update(corpus, chunks_as_numpy=use_numpy)
520
521 def init_dir_prior(self, prior, name):

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in update(self, corpus, chunksize, decay, offset, passes, update_every, eval_every, iterations, gamma_threshold, chunks_as_numpy)
978 pass_, chunk_no * chunksize + len(chunk), lencorpus
979 )
--> 980 gammat = self.do_estep(chunk, other)
981
982 if self.optimize_alpha:

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in do_estep(self, chunk, state)
740 if state is None:
741 state = self.state
--> 742 gamma, sstats = self.inference(chunk, collect_sstats=True)
743 state.sstats += sstats
744 state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in inference(self, chunk, collect_sstats)
669 epsilon = np.finfo(self.dtype).eps
670 for d, doc in enumerate(chunk):
--> 671 if len(doc) > 0 and not isinstance(doc[0][0], integer_types):
672 # make sure the term IDs are ints, otherwise np will get upset
673 ids = [int(idx) for idx, _ in doc]

~\Anaconda3\lib\site-packages\scipy\sparse\base.py in __len__(self)
293 # non-zeros is more important. For now, raise an exception!
294 def __len__(self):
--> 295 raise TypeError("sparse matrix length is ambiguous; use getnnz()"
296 " or shape[0]")
297

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

#

ValueError Traceback (most recent call last)
in
12 #termdoc = Sparse2Corpus(termdoc)
13 id2word = {v:k for k,v in vec.vocabulary_.items()}
---> 14 ldamodel = gensim.models.ldamodel.LdaModel(termdoc[:3], 2, id2word=id2word, passes=10)

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in update(self, corpus, chunksize, decay, offset, passes, update_every, eval_every, iterations, gamma_threshold, chunks_as_numpy)
963
964 if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
--> 965 self.log_perplexity(chunk, total_docs=lencorpus)
966
967 if self.dispatcher:

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in log_perplexity(self, chunk, total_docs)
817 if total_docs is None:
818 total_docs = len(chunk)
--> 819 corpus_words = sum(cnt for document in chunk for _, cnt in document)
820 subsample_ratio = 1.0 * total_docs / len(chunk)
821 perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)

~\Anaconda3\lib\site-packages\gensim\models\ldamodel.py in (.0)
817 if total_docs is None:
818 total_docs = len(chunk)
--> 819 corpus_words = sum(cnt for document in chunk for _, cnt in document)
820 subsample_ratio = 1.0 * total_docs / len(chunk)
821 perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)

ValueError: not enough values to unpack (expected 2, got 1)

bug documentation

Source

simonm3

Most helpful comment

The result of Sparse2Corpus is just a standard gensim corpus, it's not a special type.

IMO we should get rid of the documentation that claims LDA accepts CSC matrices on input. Gensim accepts only standard streamed corpora = an iterable of sparse vectors, where each sparse vector is a list of (feature_id, feature_weight) 2-tuples.

We can mention that if you have a CSC in-memory matrix, you may convert it to a streamed corpus with the help of gensim.matutils.Sparse2Corpus.

@FyzHsn would you be able to fix this? Thanks.

piskvorky on 9 May 2020

👍3

All 3 comments

Hi I just tried it out and found the error mentioned above.

>>> termdoc = docterm.T.tocsc()
>>> type(termdoc)
<class 'scipy.sparse.csc.csc_matrix'>

I'm working on fixing the documentation: Should I replace scipy.sparse.csc -> gensim.matutils.Sparse2Corpus in the gensim.models.ldamodel docstrings?