Tpot: Add support for scikit-learn Pipeline memory

Created on 29 Sep 2017 · 4Comments · Source: EpistasisLab/tpot

scikit-learn now has Pipeline memory that allows intermediate transformations to be cached. We should add this as an option to TPOT as another way to speed up the pipeline optimization process.

This feature will probably help speed up the TPOT-MDR configuration as well.

being worked on enhancement

Source

rhiever

👍1

Most helpful comment

The demo below tests pipeline caching with both stacking and the combine operator:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import FunctionTransformer
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
from copy import copy
from tpot.builtins import StackingEstimator

# Demo: check that Pipeline `memory` caching works with TPOT's stacking
# (StackingEstimator) and with the combine operator (a feature union of the
# stacked output and a pass-through copy of the input features).

# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

# Everything that uses the cache runs inside try/finally so the temporary
# directory is removed even if a grid search raises.
try:
    # Load the dataset once; both grid searches fit on the same data.
    digits = load_digits()

    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 5, 10, 100, 1000]

    # First pipeline: stacked ExtraTrees + copied input features -> LinearSVC
    cached_pipe = make_pipeline(
        make_union(
            StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=500)),
            FunctionTransformer(copy)
        ),
        LinearSVC(),
        memory=memory
    )

    # Parameter names follow the step names that make_pipeline/make_union
    # derive from the estimator class names.
    param_grid = [
        {
            'featureunion__stackingestimator__estimator__max_features': N_FEATURES_OPTIONS,
            'linearsvc__C': C_OPTIONS
        },
    ]

    # Grid search over the first cached pipeline
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=2)
    grid.fit(digits.data, digits.target)

    # Second pipeline: same feature union, different final estimator. It
    # shares the same Memory object as the first pipeline.
    cached_pipe_2 = make_pipeline(
        make_union(
            StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=500)),
            FunctionTransformer(copy)
        ),
        ExtraTreesClassifier(n_estimators=100),
        memory=memory
    )

    param_grid = [
        {
            'featureunion__stackingestimator__estimator__max_features': N_FEATURES_OPTIONS,
            'extratreesclassifier__max_depth': [1, 3, 5, 10]
        },
    ]

    # This time, a cached pipeline will be used within the grid search
    print("A cached pipeline will be used within the grid search")
    grid = GridSearchCV(cached_pipe_2, cv=3, n_jobs=1, param_grid=param_grid, verbose=2)
    grid.fit(digits.data, digits.target)
finally:
    # Delete the temporary cache before exiting
    rmtree(cachedir)

weixuanfu on 2 Oct 2017

👍2

All 4 comments

Need to test if pipeline caching works with:

[x] stacking
[x] combine op
[x] relief algorithms.

rhiever on 2 Oct 2017

The demo below tests pipeline caching with both stacking and the combine operator:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import FunctionTransformer
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
from copy import copy
from tpot.builtins import StackingEstimator

# Demo: check that Pipeline `memory` caching works with TPOT's stacking
# (StackingEstimator) and with the combine operator (a feature union of the
# stacked output and a pass-through copy of the input features).

# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

# Everything that uses the cache runs inside try/finally so the temporary
# directory is removed even if a grid search raises.
try:
    # Load the dataset once; both grid searches fit on the same data.
    digits = load_digits()

    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 5, 10, 100, 1000]

    # First pipeline: stacked ExtraTrees + copied input features -> LinearSVC
    cached_pipe = make_pipeline(
        make_union(
            StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=500)),
            FunctionTransformer(copy)
        ),
        LinearSVC(),
        memory=memory
    )

    # Parameter names follow the step names that make_pipeline/make_union
    # derive from the estimator class names.
    param_grid = [
        {
            'featureunion__stackingestimator__estimator__max_features': N_FEATURES_OPTIONS,
            'linearsvc__C': C_OPTIONS
        },
    ]

    # Grid search over the first cached pipeline
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=2)
    grid.fit(digits.data, digits.target)

    # Second pipeline: same feature union, different final estimator. It
    # shares the same Memory object as the first pipeline.
    cached_pipe_2 = make_pipeline(
        make_union(
            StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=500)),
            FunctionTransformer(copy)
        ),
        ExtraTreesClassifier(n_estimators=100),
        memory=memory
    )

    param_grid = [
        {
            'featureunion__stackingestimator__estimator__max_features': N_FEATURES_OPTIONS,
            'extratreesclassifier__max_depth': [1, 3, 5, 10]
        },
    ]

    # This time, a cached pipeline will be used within the grid search
    print("A cached pipeline will be used within the grid search")
    grid = GridSearchCV(cached_pipe_2, cv=3, n_jobs=1, param_grid=param_grid, verbose=2)
    grid.fit(digits.data, digits.target)
finally:
    # Delete the temporary cache before exiting
    rmtree(cachedir)

weixuanfu on 2 Oct 2017

👍2

Relief works too!

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2, RFE
from skrebate import ReliefF
from sklearn.ensemble import ExtraTreesClassifier

from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

# Demo: check that Pipeline `memory` caching works when the cached
# transformer step is a Relief-based feature selector (skrebate's ReliefF).

# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

# Everything that uses the cache runs inside try/finally so the temporary
# directory is removed even if a grid search raises.
try:
    # Load the dataset once; both grid searches fit on the same data.
    digits = load_digits()

    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 5, 10, 100, 1000]

    # First pipeline: ReliefF feature selection -> LinearSVC
    cached_pipe = Pipeline([('reduce_dim', ReliefF()),
                            ('classify', LinearSVC())],
                           memory=memory)

    param_grid = [
        {
            'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]

    # Grid search over the first cached pipeline
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=3)
    grid.fit(digits.data, digits.target)

    # Second pipeline: same ReliefF step, different final estimator. It
    # shares the same Memory object as the first pipeline.
    cached_pipe = Pipeline([('reduce_dim', ReliefF()),
                            ('classify', ExtraTreesClassifier(n_estimators=100))],
                           memory=memory)

    param_grid = [
        {
            'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
            'classify__max_depth': [1, 3, 5, 10]
        },
    ]

    # This time, a cached pipeline will be used within the grid search
    print("A cached pipeline will be used within the grid search")
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=3)
    grid.fit(digits.data, digits.target)
finally:
    # Delete the temporary cache before exiting
    rmtree(cachedir)

weixuanfu on 2 Oct 2017

👍1

Awesome. Let's get this implemented on the dev branch and get to testing it within TPOT. Maybe we can release it as a minor release this week.

rhiever on 2 Oct 2017

Was this page helpful?

0 / 5 - 0 ratings