Gensim: Mallet wrapper raises RuntimeError: invalid doc topics format when prefix path is passed

Created on 24 May 2019 · 1Comment · Source: RaRe-Technologies/gensim

Problem description

I am loading a saved Mallet wrapper model to generate topics for a given text. I pass a prefix when I train the model. When I load the model, I get a RuntimeError raised by read_doctopics() in the ldamallet.py module.

Steps/code/corpus to reproduce

import pandas as pd
import numpy as np
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import os
import joblib

# Nine already-lemmatized documents (lists of tokens) used as the toy corpus.
lemmatized_words = [
    ['cold', 'spring', 'april', 'united', 'state', 'outstanding', 'potential', 'become', 'world', 'leader', 'wind', 'energy'],
    ['lag', 'china', 'european', 'union', 'china', 'currently', 'lead', 'world', 'wind', 'power', 'country', 'project', 'percent', 'energy', 'market', 'supply'],
    ['renewable', 'energy', 'accord', 'wind', 'europe', 'report', 'wind', 'power', 'european', 'union', 'generate', 'enough', 'electricity', 'support', 'percent', 'total', 'energy'],
    ['consumption', 'french', 'president', 'emmanuel', 'macron', 'promise', 'france'],
    ['alone', 'would', 'triple', 'wind', 'power', 'capacity', 'unite', 'state', 'produce', 'percent', 'total', 'electricity', 'wind', 'power', 'fall', 'china', 'catch', 'answer', 'may', 'seem', 'simple'],
    ['instal', 'wind', 'farm', 'optimize', 'performance', 'wind', 'turbine', 'already', 'create', 'job', 'strong', 'diverse', 'economy', 'sustainable', 'future', 'come', 'generation', 'cloudvisit'],
    ['invest', 'vision', 'develop', 'wind', 'turbine', 'maintenance', 'software', 'make', 'future', 'possible', 'wind', 'growth', 'unite', 'state', 'wind', 'energy', 'growing', 'hold', 'notable', 'potential', 'nation'],
    ['first', 'utility', 'scale', 'offshore', 'wind', 'energy', 'project', 'vineyard', 'wind', 'recently', 'award', 'year', 'contract', 'provide', 'electricity', 'approximately', 'third', 'cost', 'renewable', 'cloudvisit'],
    ['wind', 'turbine', 'maintenance', 'software', 'help', 'support', 'renewable', 'affordable', 'electricity', 'nationwide', 'cloudvisit', 'wind', 'turbine', 'maintenance', 'software', 'enable', 'remote', 'inspection', 'wind', 'farm', 'inspector'],
]

def find_optimum_model(lemmatized_words):
    """Train several LdaMallet models, keep the one with the best c_v coherence, and pickle it.

    Builds a dictionary and bag-of-words corpus from ``lemmatized_words``,
    trains one model per topic count, and writes the corpus, the dictionary
    and the highest-scoring model to 'corpus.pkl',
    'id2word_dictionary_mallet.pkl' and 'optimal_ldamallet_model.pkl' in the
    current working directory. Returns nothing.
    """
    id2word = corpora.Dictionary(lemmatized_words)
    all_corpus = [id2word.doc2bow(text) for text in lemmatized_words]

    def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=3):
        """Train one model per topic count in range(start, limit, step).

        Returns (model_list, coherence_values), index-aligned.
        """
        coherence_values = []
        model_list = []

        # Update the next two lines with the path to your own MALLET installation.
        os.environ.update({'MALLET_HOME':r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8'})
        mallet_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8\\bin\\mallet.bat'
        # NOTE(review): every model trained in the loop below is given this same
        # prefix. The follow-up comment on this issue attributes the
        # RuntimeError to the models' temp files overwriting each other.
        prefix_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet_temp\\'
        for num_topics in range(start, limit, step):
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
                                             prefix=prefix_path, random_seed=42)

            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

    model_list, coherence_values = compute_coherence_values(dictionary=id2word, all_corpus=all_corpus,texts=lemmatized_words,start=2,
                                                            limit=40, step=6)
    model_values_df = pd.DataFrame({'model_list':model_list,'coherence_values':coherence_values})

    # Pick the model object from the row with the highest coherence score;
    # this is not necessarily the last model trained.
    optimal_model = model_values_df.loc[model_values_df['coherence_values'].idxmax()]['model_list']

    joblib.dump(all_corpus,'corpus.pkl')
    joblib.dump(id2word,'id2word_dictionary_mallet.pkl')
    joblib.dump(optimal_model,'optimal_ldamallet_model.pkl')

def generate_dominant_topic(lemmatized_words):
    """Return the dominant topic of each document according to the saved model.

    Loads the dictionary and model from 'id2word_dictionary_mallet.pkl' and
    'optimal_ldamallet_model.pkl' in the current working directory, converts
    ``lemmatized_words`` (a list of token lists) to bag-of-words, and returns
    a DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution' and
    'Topic_Keywords'.
    """
    id2word = joblib.load('id2word_dictionary_mallet.pkl')
    new_corpus = [id2word.doc2bow(text) for text in lemmatized_words]
    optimal_model = joblib.load('optimal_ldamallet_model.pkl')


    def format_topics_sentences(ldamodel, new_corpus):
        """Build one DataFrame row per document holding its highest-weighted topic."""
        # NOTE: DataFrame.append (used below) was deprecated in pandas 1.4 and
        # removed in 2.0; on newer pandas collect the rows in a list and build
        # the frame once instead.
        sent_topics_df = pd.DataFrame()
        for i, row in enumerate(ldamodel[new_corpus]):
            # Sort the (topic, proportion) pairs so the dominant topic comes first.
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:
                    # Only the first (highest-proportion) topic is recorded.
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]),
                                                           ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
        return (sent_topics_df)

    df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, new_corpus=new_corpus)
    # Fixed: the original returned the undefined name `df_topic_sent_keywords`
    # (missing "s"), which raises NameError once inference succeeds.
    return (df_topic_sents_keywords)


# Train and pickle the best model, then reload it and infer dominant topics.
find_optimum_model(lemmatized_words)

# Fixed typo: the original passed the undefined name `lemmmatized_words`.
generate_dominant_topic(lemmatized_words)

Include full tracebacks, logs and datasets if necessary. Please keep the examples minimal ("minimal reproducible example").

RuntimeError Traceback (most recent call last)
in
----> 1 generate_dominant_topic(lemmatized_words)

in generate_dominant_topic(lemmatized_words)
55 return (sent_topics_df)
56
---> 57 df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, new_corpus=new_corpus)
58 return (df_topic_sent_keywords)

in format_topics_sentences(ldamodel, new_corpus)
42 def format_topics_sentences(ldamodel, new_corpus):
43 sent_topics_df = pd.DataFrame()
---> 44 for i, row in enumerate(ldamodel[new_corpus]):
45 row = sorted(row, key=lambda x: (x[1]), reverse=True)
46 for j, (topic_num, prop_topic) in enumerate(row):

~\AppData\Local\Continuum\anaconda3\lib\site-packages\gensim\models\wrappers\ldamallet.py in __getitem__(self, bow, iterations)
323 logger.info("inferring topics with MALLET LDA '%s'", cmd)
324 check_output(args=cmd, shell=True)
--> 325 result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
326 return result if is_corpus else result[0]
327

~\AppData\Local\Continuum\anaconda3\lib\site-packages\gensim\models\wrappers\ldamallet.py in read_doctopics(self, fname, eps, renorm)
562 count += 1
563 else:
--> 564 raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname))
565
566 if renorm:

RuntimeError: invalid doc topics format at line 2 in C:\users\axk0er8\Sentiment_Analysis_Working\NewsSentimentAnalysis\mallet_temp\doctopics.txt.infer

Versions

Please provide the output of:

import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import gensim; print("gensim", gensim.__version__)
from gensim.models import word2vec;print("FAST_VERSION", word2vec.FAST_VERSION)

Windows-10-10.0.16299-SP0
Python 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
NumPy 1.16.2
SciPy 1.2.1
gensim 3.7.3
FAST_VERSION -1

Source

aashishkhadka1992

Most helpful comment

I have found the reason for this particular error within the block of code below:

def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=3):
        """Train one LdaMallet model per topic count and score each with c_v coherence.

        Topic counts are taken from range(start, limit, step). Returns the
        pair (model_list, coherence_values), index-aligned.
        """
        coherence_values = []
        model_list = []

        # Update the next two lines with the path to your own MALLET installation.
        os.environ.update({'MALLET_HOME':r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8'})
        mallet_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8\\bin\\mallet.bat'
        # Every model trained in the loop below is given this same prefix.
        prefix_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet_temp\\'
        for num_topics in range(start, limit, step):
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
                                             prefix=prefix_path, random_seed=42)

            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

When this function is executed, the temp files are rewritten on every loop iteration, so the files left on disk are those of the final iteration. But the optimal model is not always the last model, so a mismatch arises between the model's parameters and the stored temp files. I made a few changes to fix this error. Below is the code:
```python
def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=4):
        """Train one LdaMallet model per topic count and score each with c_v coherence.

        Topic counts are taken from range(start, limit, step). No ``prefix`` is
        passed to LdaMallet here. ``mallet_path`` is not defined in this
        function and must be provided by the enclosing scope.

        Returns the triple (model_list, coherence_values, num_topics_list),
        index-aligned.
        """
        coherence_values = []
        model_list = []
        num_topics_list = []

        for num_topics in range(start, limit, step):
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
                                             random_seed=42)
            #model = gensim.models.ldamodel.LdaModel(corpus=all_corpus,num_topics=num_topics,id2word=dictionary,eval_every=1,
            #                                        alpha='auto',random_state=42)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
            # Remember the topic count so the best setting can be retrained later.
            num_topics_list.append(num_topics)

        return model_list, coherence_values, num_topics_list

    model_list, coherence_values, num_topics_list = compute_coherence_values(dictionary=id2word,all_corpus=all_corpus,
                                                                             texts=lemmatized_words,start=5,limit=40, step=6)
    model_values_df = pd.DataFrame({'model_list':model_list,'coherence_values':coherence_values,'num_topics':num_topics_list})

    optimal_num_topics = model_values_df.loc[model_values_df['coherence_values'].idxmax()]['num_topics']

    optimal_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=optimal_num_topics, id2word=id2word,
                                                     prefix=prefix_path, random_seed=42)

    joblib.dump(id2word,'id2word_dictionary_mallet.pkl')
    joblib.dump(optimal_model,'optimal_ldamallet_model.pkl')
```

aashishkhadka1992 on 28 May 2019

👍2 🚀1 ❤1

>All comments

I have found the reason for this particular error within the block of code below:

def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=3):
        """Train one LdaMallet model per topic count and score each with c_v coherence.

        Topic counts are taken from range(start, limit, step). Returns the
        pair (model_list, coherence_values), index-aligned.
        """
        coherence_values = []
        model_list = []

        # Update the next two lines with the path to your own MALLET installation.
        os.environ.update({'MALLET_HOME':r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8'})
        mallet_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8\\bin\\mallet.bat'
        # Every model trained in the loop below is given this same prefix.
        prefix_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet_temp\\'
        for num_topics in range(start, limit, step):
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
                                             prefix=prefix_path, random_seed=42)

            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

        # NOTE(review): this loop and the return below are the body of the
        # revised compute_coherence_values (the variant that passes no prefix
        # and also returns num_topics_list). Its `def` line and the
        # initialisation of coherence_values, model_list and num_topics_list
        # are missing from this copy of the comment, so as placed here the
        # loop sits after the preceding `return` and cannot run.
        for num_topics in range(start, limit, step):
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
                                             random_seed=42)
            #model = gensim.models.ldamodel.LdaModel(corpus=all_corpus,num_topics=num_topics,id2word=dictionary,eval_every=1,
            #                                        alpha='auto',random_state=42)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
            num_topics_list.append(num_topics)

        return model_list, coherence_values, num_topics_list

    model_list, coherence_values, num_topics_list = compute_coherence_values(dictionary=id2word,all_corpus=all_corpus,
                                                                             texts=lemmatized_words,start=5,limit=40, step=6)
    model_values_df = pd.DataFrame({'model_list':model_list,'coherence_values':coherence_values,'num_topics':num_topics_list})

    optimal_num_topics = model_values_df.loc[model_values_df['coherence_values'].idxmax()]['num_topics']

    optimal_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=optimal_num_topics, id2word=id2word,
                                                     prefix=prefix_path, random_seed=42)

    joblib.dump(id2word,'id2word_dictionary_mallet.pkl')
    joblib.dump(optimal_model,'optimal_ldamallet_model.pkl')

aashishkhadka1992 on 28 May 2019

👍2 🚀1 ❤1

Was this page helpful?

0 / 5 - 0 ratings