scikit-learn's `Pipeline` now accepts a `memory` argument that allows intermediate transformations to be cached. We should add this as an option in TPOT as another way to speed up the pipeline optimization process.
This feature will probably help speed up the TPOT-MDR configuration as well.
Need to test if pipeline caching works with:
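The demo below tests pipeline caching with both the stacking operator (`StackingEstimator`) and the combine operator (`make_union` with a copy of the input features):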
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import FunctionTransformer
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
from copy import copy
from tpot.builtins import StackingEstimator
# Demo: check that scikit-learn pipeline caching (the `memory` argument)
# works when the first pipeline step is a union of a stacked estimator and
# an unchanged copy of the input features.

# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
try:
    memory = Memory(cachedir=cachedir, verbose=10)

    # Both grid searches fit on the same dataset, so load it only once.
    digits = load_digits()

    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 5, 10, 100, 1000]

    # First search: cached feature union followed by a linear SVM.
    cached_pipe = make_pipeline(
        make_union(
            StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=500)),
            FunctionTransformer(copy),
        ),
        LinearSVC(),
        memory=memory,
    )
    param_grid = [
        {
            'featureunion__stackingestimator__estimator__max_features': N_FEATURES_OPTIONS,
            'linearsvc__C': C_OPTIONS,
        },
    ]
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=2)
    grid.fit(digits.data, digits.target)

    # Second search: same feature union and the same Memory object, but a
    # different final estimator. Identical feature-union fits from the first
    # search may be served from the cache (watch the verbose Memory output).
    cached_pipe_2 = make_pipeline(
        make_union(
            StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=500)),
            FunctionTransformer(copy),
        ),
        ExtraTreesClassifier(n_estimators=100),
        memory=memory,
    )
    param_grid = [
        {
            'featureunion__stackingestimator__estimator__max_features': N_FEATURES_OPTIONS,
            'extratreesclassifier__max_depth': [1, 3, 5, 10],
        },
    ]
    print("A cached pipeline will be used within the grid search")
    grid = GridSearchCV(cached_pipe_2, cv=3, n_jobs=1, param_grid=param_grid, verbose=2)
    grid.fit(digits.data, digits.target)
finally:
    # Delete the temporary cache before exiting, even if a step above raised,
    # so a failed run does not leave the cache directory behind.
    rmtree(cachedir)
ReliefF (from skrebate) works with pipeline caching too!
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2, RFE
from skrebate import ReliefF
from sklearn.ensemble import ExtraTreesClassifier
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
# Demo: check that scikit-learn pipeline caching (the `memory` argument)
# works when the cached transformer step is skrebate's ReliefF.

# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
try:
    memory = Memory(cachedir=cachedir, verbose=10)

    # Both grid searches fit on the same dataset, so load it only once.
    digits = load_digits()

    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 5, 10, 100, 1000]

    # First search: cached ReliefF feature selection followed by a linear SVM.
    cached_pipe = Pipeline(
        [
            ('reduce_dim', ReliefF()),
            ('classify', LinearSVC()),
        ],
        memory=memory,
    )
    param_grid = [
        {
            'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS,
        },
    ]
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=3)
    grid.fit(digits.data, digits.target)

    # Second search: same ReliefF step and the same Memory object, but a
    # different classifier. Identical ReliefF fits from the first search may
    # be served from the cache (watch the verbose Memory output).
    cached_pipe = Pipeline(
        [
            ('reduce_dim', ReliefF()),
            ('classify', ExtraTreesClassifier(n_estimators=100)),
        ],
        memory=memory,
    )
    param_grid = [
        {
            'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
            'classify__max_depth': [1, 3, 5, 10],
        },
    ]
    print("A cached pipeline will be used within the grid search")
    grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=3)
    grid.fit(digits.data, digits.target)
finally:
    # Delete the temporary cache before exiting, even if a step above raised,
    # so a failed run does not leave the cache directory behind.
    rmtree(cachedir)
Awesome. Let's get this implemented on the dev branch and get to testing it within TPOT. Maybe we can release it as a minor release this week.
Most helpful comment
The demo below is for testing both stacking and combine op: