Gensim: Keep the period symbols when extracting articles from wikipedia by WikiCorpus.get_texts()

Created on 30 Nov 2015 · 4 comments · Source: RaRe-Technologies/gensim

Hi

I found that when extracting Wikipedia contents by gensim.corpora.wikicorpus.WikiCorpus.get_texts(), the punctuations, including the periods, are eliminated. Shouldn't the periods be included to identify the sentence endings? Thanks!

Most helpful comment

Thanks @tmylk. This is how I did it. Create a new file mywikicorpus.py with the following content.

Redefining only tokenize(content) does not work. You need to override get_texts(self) and process_article(args) otherwise the instance of subclass MyWikiCorpus would keep calling the tokenize(content) in the wikicorpus.py file.

I think putting all these methods in the body of the class makes it easier to extend the code. I am not sure what's the advantage of defining methods outside of the class if they are only being used by the class.

import sys
import os

parent = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parent + '/../venv/lib/python2.7/site-packages/gensim/corpora/')

from wikicorpus import *

def tokenize(content):
    """Turn raw article text into a list of UTF-8 encoded tokens.

    Replaces the module-level ``tokenize`` from ``wikicorpus.py``: lowercases,
    ignores decoding errors, and keeps only tokens of at most 15 characters
    that do not begin with an underscore (filters out MediaWiki markup ids).
    """
    candidates = utils.tokenize(content, lower=True, errors='ignore')
    kept = []
    for candidate in candidates:
        if len(candidate) > 15 or candidate.startswith('_'):
            continue
        kept.append(candidate.encode('utf8'))
    return kept

def process_article(args):
    """Convert one ``(text, lemmatize, title, pageid)`` tuple into tokens.

    Replaces the module-level ``process_article`` from ``wikicorpus.py`` so
    that the customized ``tokenize`` defined in this module is used instead of
    the original one.
    """
    text, lemmatize, title, pageid = args
    cleaned = filter_wiki(text)
    if lemmatize:
        tokens = utils.lemmatize(cleaned)
    else:
        tokens = tokenize(cleaned)
    return tokens, title, pageid


class MyWikiCorpus(WikiCorpus):
    """``WikiCorpus`` subclass whose ``get_texts`` routes every article through
    the module-level ``process_article``/``tokenize`` defined in this file
    (needed because the stock ``get_texts`` hard-codes the originals)."""

    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        # Same signature as WikiCorpus; just forward all arguments unchanged.
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)

    def get_texts(self):
        """Iterate over the dump, yielding one tokenized article at a time.

        Yields ``tokens`` (or ``(tokens, (pageid, title))`` when
        ``self.metadata`` is set).  Articles shorter than ``ARTICLE_MIN_WORDS``
        and pages in ``IGNORED_NAMESPACES`` are skipped.  Mirrors
        ``WikiCorpus.get_texts`` except that articles are processed by the
        module-level ``process_article`` above.
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = (
            (text, self.lemmatize, title, pageid)
            for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
        )
        pool = multiprocessing.Pool(self.processes)
        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
                for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens
        finally:
            # BUGFIX: terminate the pool even when the consumer abandons the
            # generator early or an exception is raised mid-iteration —
            # otherwise the worker processes leak.
            pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length

All 4 comments

get_texts() returns lists of words.

If you want a different functionality, the easiest way is to subclass WikiCorpus and customize it to your needs.

The "raw texts" are coming from wikicorpus.extract_pages() and are processed by wikicorpus.process_article() -- customizing should be straightforward.

@piskvorky Thanks for clarification. I want to subclass WikiCorpus and override def tokenize(content) function.

from gensim.corpora import WikiCorpus
from gensim import utils as gensim_utils

class MyWikiCorpus(WikiCorpus):
    def __init__(self, fname, processes=None, lemmatize=gensim_utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)

The `tokenize(content)` function is not defined in the body of the WikiCorpus class. How can one override it?

@rhazegh One can define `tokenize` again with `def tokenize` after importing the WikiCorpus code file. That will override it.

Thanks @tmylk. This is how I did it. Create a new file mywikicorpus.py with the following content.

Redefining only tokenize(content) does not work. You need to override get_texts(self) and process_article(args) otherwise the instance of subclass MyWikiCorpus would keep calling the tokenize(content) in the wikicorpus.py file.

I think putting all these methods in the body of the class makes it easier to extend the code. I am not sure what's the advantage of defining methods outside of the class if they are only being used by the class.

import sys
import os

parent = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parent + '/../venv/lib/python2.7/site-packages/gensim/corpora/')

from wikicorpus import *

def tokenize(content):
    """Turn raw article text into a list of UTF-8 encoded tokens.

    Replaces the module-level ``tokenize`` from ``wikicorpus.py``: lowercases,
    ignores decoding errors, and keeps only tokens of at most 15 characters
    that do not begin with an underscore (filters out MediaWiki markup ids).
    """
    candidates = utils.tokenize(content, lower=True, errors='ignore')
    kept = []
    for candidate in candidates:
        if len(candidate) > 15 or candidate.startswith('_'):
            continue
        kept.append(candidate.encode('utf8'))
    return kept

def process_article(args):
    """Convert one ``(text, lemmatize, title, pageid)`` tuple into tokens.

    Replaces the module-level ``process_article`` from ``wikicorpus.py`` so
    that the customized ``tokenize`` defined in this module is used instead of
    the original one.
    """
    text, lemmatize, title, pageid = args
    cleaned = filter_wiki(text)
    if lemmatize:
        tokens = utils.lemmatize(cleaned)
    else:
        tokens = tokenize(cleaned)
    return tokens, title, pageid


class MyWikiCorpus(WikiCorpus):
    """``WikiCorpus`` subclass whose ``get_texts`` routes every article through
    the module-level ``process_article``/``tokenize`` defined in this file
    (needed because the stock ``get_texts`` hard-codes the originals)."""

    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        # Same signature as WikiCorpus; just forward all arguments unchanged.
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)

    def get_texts(self):
        """Iterate over the dump, yielding one tokenized article at a time.

        Yields ``tokens`` (or ``(tokens, (pageid, title))`` when
        ``self.metadata`` is set).  Articles shorter than ``ARTICLE_MIN_WORDS``
        and pages in ``IGNORED_NAMESPACES`` are skipped.  Mirrors
        ``WikiCorpus.get_texts`` except that articles are processed by the
        module-level ``process_article`` above.
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = (
            (text, self.lemmatize, title, pageid)
            for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
        )
        pool = multiprocessing.Pool(self.processes)
        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
                for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens
        finally:
            # BUGFIX: terminate the pool even when the consumer abandons the
            # generator early or an exception is raised mid-iteration —
            # otherwise the worker processes leak.
            pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length
Was this page helpful?
0 / 5 - 0 ratings

Related issues

coopwilliams picture coopwilliams · 3 comments

mikkokotila picture mikkokotila · 3 comments

ahmedbhabbas picture ahmedbhabbas · 4 comments

menshikh-iv picture menshikh-iv · 3 comments

sairampillai picture sairampillai · 3 comments