I am loading a saved mallet wrapper model to generate topics for given text. I pass prefix when I train the model. When I load the model, I am getting runtimeerror raised by read_doctopics() in ldamallet.py module.
import pandas as pd
import numpy as np
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import os
import joblib
lemmatized_words = [['cold', 'spring', 'april', 'united', 'state', 'outstanding', 'potential', become', 'world', 'leader', 'wind', 'energy'], ['lag', 'china', 'european', 'union', 'china', currently', 'lead', 'world', 'wind', 'power', 'country', 'project', 'percent', 'energy', 'market', supply'], ['renewable', 'energy', 'accord', 'wind', 'europe', 'report', 'wind', 'power', 'european', union', 'generate', 'enough', 'electricity', 'support', 'percent', 'total', 'energy'], [ 'consumption', french', 'president', 'emmanuel', 'macron', 'promise', 'france'], [ 'alone', 'would', 'triple', 'wind', power', 'capacity', 'unite', 'state', 'produce', 'percent', 'total', 'electricity', 'wind', 'power', fall', china', 'catch', 'answer', 'may', 'seem', 'simple'], ['instal', 'wind', 'farm', 'optimize', performance', 'wind', 'turbine', 'already', 'create', 'job', 'strong', 'diverse', 'economy', sustainable', 'future', 'come', 'generation', 'cloudvisit'], ['invest', 'vision', 'develop', 'wind', turbine', 'maintenance', 'software', 'make', 'future', 'possible', 'wind', 'growth', 'unite', 'state', wind', 'energy', 'growing', 'hold', 'notable', 'potential', 'nation'], ['first', 'utility', 'scale', offshore', 'wind', 'energy', 'project', 'vineyard', 'wind', 'recently', 'award', 'year', 'contract', provide', 'electricity', 'approximately', 'third', 'cost', 'renewable', 'cloudvisit'], ['wind', 'turbine', maintenance', 'software', 'help', 'support', 'renewable', 'affordable', 'electricity', 'nationwide', cloudvisit', 'wind', 'turbine', 'maintenance', 'software', 'enable', 'remote', 'inspection', 'wind', farm', 'inspector']]
def find_optimum_model(lemmatized_words):
id2word = corpora.Dictionary(lemmatized_words)
all_corpus = [id2word.doc2bow(text) for text in lemmatized_words]
def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=3):
coherence_values = []
model_list = []
#For two lines below update with your path to new_mallet
os.environ.update({'MALLET_HOME':r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8'})
mallet_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8\\bin\\mallet.bat'
prefix_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet_temp\\'
for num_topics in range(start, limit, step):
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
prefix=prefix_path, random_seed=42)
model_list.append(model)
coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_values.append(coherencemodel.get_coherence())
return model_list, coherence_values
model_list, coherence_values = compute_coherence_values(dictionary=id2word, all_corpus=all_corpus,texts=lemmatized_words,start=2,
limit=40, step=6)
model_values_df = pd.DataFrame({'model_list':model_list,'coherence_values':coherence_values})
optimal_model = model_values_df.loc[model_values_df['coherence_values'].idxmax()]['model_list']
joblib.dump(all_corpus,'corpus.pkl')
joblib.dump(id2word,'id2word_dictionary_mallet.pkl')
joblib.dump(optimal_model,'optimal_ldamallet_model.pkl')
def generate_dominant_topic(lemmatized_words):
id2word = joblib.load('id2word_dictionary_mallet.pkl')
new_corpus = [id2word.doc2bow(text) for text in lemmatized_words]
optimal_model = joblib.load('optimal_ldamallet_model.pkl')
def format_topics_sentences(ldamodel, new_corpus):
sent_topics_df = pd.DataFrame()
for i, row in enumerate(ldamodel[new_corpus]):
row = sorted(row, key=lambda x: (x[1]), reverse=True)
for j, (topic_num, prop_topic) in enumerate(row):
if j == 0:
wp = ldamodel.show_topic(topic_num)
topic_keywords = ", ".join([word for word, prop in wp])
sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]),
ignore_index=True)
else:
break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
return (sent_topics_df)
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, new_corpus=new_corpus)
return (df_topic_sent_keywords)
find_optimum_model(lemmatized_words)
generate_dominant_topic(lemmmatized_words)
RuntimeError Traceback (most recent call last)
----> 1 generate_dominant_topic(lemmatized_words)
55 return (sent_topics_df)
56
---> 57 df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, new_corpus=new_corpus)
58 return (df_topic_sent_keywords)
42 def format_topics_sentences(ldamodel, new_corpus):
43 sent_topics_df = pd.DataFrame()
---> 44 for i, row in enumerate(ldamodel[new_corpus]):
45 row = sorted(row, key=lambda x: (x[1]), reverse=True)
46 for j, (topic_num, prop_topic) in enumerate(row):
~AppDataLocalContinuumanaconda3libsite-packagesgensimmodelswrappersldamallet.py in __getitem__(self, bow, iterations)
323 logger.info("inferring topics with MALLET LDA '%s'", cmd)
324 check_output(args=cmd, shell=True)
--> 325 result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
326 return result if is_corpus else result[0]
327
~AppDataLocalContinuumanaconda3libsite-packagesgensimmodelswrappersldamallet.py in read_doctopics(self, fname, eps, renorm)
562 count += 1
563 else:
--> 564 raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname))
565
566 if renorm:
RuntimeError: invalid doc topics format at line 2 in C:\users\axk0er8\Sentiment_Analysis_Working\NewsSentimentAnalysis\mallet_temp\doctopics.txt.infer
Please provide the output of:
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import gensim; print("gensim", gensim.__version__)
from gensim.models import word2vec;print("FAST_VERSION", word2vec.FAST_VERSION)
Windows-10-10.0.16299-SP0
Python 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
NumPy 1.16.2
SciPy 1.2.1
gensim 3.7.3
FAST_VERSION -1
I have found the reason for this particular error within below block of code:
def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=3):
coherence_values = []
model_list = []
#For two lines below update with your path to new_mallet
os.environ.update({'MALLET_HOME':r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8'})
mallet_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\new_mallet\\mallet-2.0.8\\bin\\mallet.bat'
prefix_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet_temp\\'
for num_topics in range(start, limit, step):
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
prefix=prefix_path, random_seed=42)
model_list.append(model)
coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_values.append(coherencemodel.get_coherence())
return model_list, coherence_values
When this function is executed, temp files are updated with every loop and the final temp files are ones for the final loop. But the optimal model is not always the last model. So, there arises mismatch in the model parameters and stored temp files. I made few changes to fix this error. Below is the code:
```python
def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=4):
coherence_values = []
model_list = []
num_topics_list = []
for num_topics in range(start, limit, step):
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics, id2word=dictionary,
random_seed=42)
#model = gensim.models.ldamodel.LdaModel(corpus=all_corpus,num_topics=num_topics,id2word=dictionary,eval_every=1,
# alpha='auto',random_state=42)
model_list.append(model)
coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_values.append(coherencemodel.get_coherence())
num_topics_list.append(num_topics)
return model_list, coherence_values, num_topics_list
model_list, coherence_values, num_topics_list = compute_coherence_values(dictionary=id2word,all_corpus=all_corpus,
texts=lemmatized_words,start=5,limit=40, step=6)
model_values_df = pd.DataFrame({'model_list':model_list,'coherence_values':coherence_values,'num_topics':num_topics_list})
optimal_num_topics = model_values_df.loc[model_values_df['coherence_values'].idxmax()]['num_topics']
optimal_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=optimal_num_topics, id2word=id2word,
prefix=prefix_path, random_seed=42)
joblib.dump(id2word,'id2word_dictionary_mallet.pkl')
joblib.dump(optimal_model,'optimal_ldamallet_model.pkl')```
Most helpful comment
I have found the reason for this particular error within below block of code:
When this function is executed, temp files are updated with every loop and the final temp files are ones for the final loop. But the optimal model is not always the last model. So, there arises mismatch in the model parameters and stored temp files. I made few changes to fix this error. Below is the code:
```python
def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=4):
coherence_values = []
model_list = []
num_topics_list = []