-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
Add stacking-meta-model #6674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add stacking-meta-model #6674
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
# libraries
import numpy as np

# scikit-learn base classes
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin

# scikit-learn modules
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# cross_val_predict now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import VotingClassifier
class StackingClassifier(BaseEstimator, ClassifierMixin):
    """Two-stage stacking ensemble classifier.

    Stage one: each classifier in ``stage_one_clfs`` produces out-of-fold
    predictions (5-fold ``cross_val_predict``) which are appended to the
    feature matrix as extra meta-feature columns, then is refitted on the
    full data for use at prediction time.
    Stage two: each classifier in ``stage_two_clfs`` is fitted on the
    augmented matrix; their weighted votes produce the final output.

    Parameters
    ----------
    stage_one_clfs : list of estimators
        Fitted on ``X``; their predictions become the meta-features.
    stage_two_clfs : list of estimators
        Fitted on the augmented matrix ``[X, meta-features]``.
    weights : list of numbers, optional
        Voting weight of each stage-two classifier.  Defaults to equal
        weights.
    n_runs : int, default 10
        Kept for backward compatibility.  Re-predicting ``n_runs`` times
        and averaging is a no-op for deterministic fitted classifiers
        (the factor cancels), so the repetition is no longer performed.
    """

    def __init__(self, stage_one_clfs, stage_two_clfs, weights=None, n_runs=10):
        self.stage_one_clfs = stage_one_clfs
        self.stage_two_clfs = stage_two_clfs
        # BUGFIX: use "is None" -- "== None" misbehaves for array-like weights.
        if weights is None:
            self.weights = [1] * len(stage_two_clfs)
        else:
            self.weights = weights
        self.n_runs = n_runs

    def fit(self, X, y):
        """Fit the stage-one and stage-two classifiers.

        Returns
        -------
        self : StackingClassifier
            BUGFIX: the original returned None, breaking the scikit-learn
            ``fit(...).predict(...)`` convention.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Stage one: out-of-fold predictions become new feature columns,
        # so stage two never sees predictions made on training folds.
        for clf in self.stage_one_clfs:
            y_pred = cross_val_predict(clf, X, y, cv=5, n_jobs=1)
            clf.fit(X, y)  # refit on the full data for later predict()
            X = np.hstack((X, y_pred.reshape(-1, 1)))

        # Stage two: fit each meta-classifier on the augmented matrix.
        for clf in self.stage_two_clfs:
            clf.fit(X, y)
        return self

    def _augment(self, X_test):
        """Append each stage-one classifier's predictions as a column."""
        X_test = np.asarray(X_test)
        for clf in self.stage_one_clfs:
            y_pred = clf.predict(X_test)
            X_test = np.hstack((X_test, np.reshape(y_pred, (len(y_pred), 1))))
        return X_test

    def predict(self, X_test):
        """Predict the class of each sample by weighted majority vote.

        BUGFIX: the original built an *unfitted* ``VotingClassifier``
        whose estimators all shared the name "clf" and called ``predict``
        on it, which raises.  The weighted hard vote is now computed
        directly from the already-fitted stage-two classifiers.
        """
        X_aug = self._augment(X_test)
        votes = np.asarray([clf.predict(X_aug) for clf in self.stage_two_clfs])
        weights = np.asarray(self.weights, dtype=float)

        y_out = []
        for sample_votes in votes.T:  # one column of votes per sample
            labels, idx = np.unique(sample_votes, return_inverse=True)
            # Sum the weight behind each distinct label; keep the argmax.
            y_out.append(labels[np.bincount(idx, weights=weights).argmax()])
        return np.asarray(y_out)

    def predict_proba(self, X_test):
        """Predict class probabilities as the weighted mean of the
        stage-two classifiers' ``predict_proba`` outputs.

        Raises
        ------
        AttributeError
            If any stage-two classifier does not implement
            ``predict_proba`` (hard-only classifiers cannot contribute).
        """
        for clf in self.stage_two_clfs:
            if not hasattr(clf, "predict_proba"):
                raise AttributeError(
                    "%r does not support predict_proba" % (clf,))

        X_aug = self._augment(X_test)
        # NOTE: the original summed n_runs identical weighted predictions
        # and divided by n_runs * sum(weights); the n_runs factor cancels
        # exactly, so a single weighted mean is equivalent.
        total = sum(
            w * clf.predict_proba(X_aug)
            for w, clf in zip(self.weights, self.stage_two_clfs)
        )
        return total / np.sum(self.weights)

    def run_gridsearch(self, X, y, params):
        """Grid search over the ensemble.  Not implemented yet."""
        print("Not implemented yet")
class StackingRegressor(BaseEstimator, RegressorMixin):
    """Two-stage stacking ensemble regressor.

    Stage one: each regressor in ``stage_one_clfs`` produces out-of-fold
    predictions (5-fold ``cross_val_predict``) which are appended to the
    feature matrix as extra meta-feature columns, then is refitted on the
    full data for use at prediction time.
    Stage two: each regressor in ``stage_two_clfs`` is fitted on the
    augmented matrix; the final prediction is their weighted mean.

    Parameters
    ----------
    stage_one_clfs : list of estimators
        Fitted on ``X``; their predictions become the meta-features.
    stage_two_clfs : list of estimators
        Fitted on the augmented matrix ``[X, meta-features]``.
    weights : list of numbers, optional
        Averaging weight of each stage-two regressor.  Defaults to equal
        weights.
    n_runs : int, default 10
        Kept for backward compatibility.  Averaging ``n_runs`` identical
        predictions is a no-op for deterministic fitted regressors, so
        the repetition is no longer performed.
    """

    def __init__(self, stage_one_clfs, stage_two_clfs, weights=None, n_runs=10):
        self.stage_one_clfs = stage_one_clfs
        self.stage_two_clfs = stage_two_clfs
        # BUGFIX: use "is None" -- "== None" misbehaves for array-like weights.
        if weights is None:
            self.weights = [1] * len(stage_two_clfs)
        else:
            self.weights = weights
        self.n_runs = n_runs

    def fit(self, X, y):
        """Fit the stage-one and stage-two regressors; return ``self``.

        BUGFIX: the original returned None, breaking the scikit-learn
        ``fit(...).predict(...)`` convention.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Stage one: out-of-fold predictions become new feature columns.
        for clf in self.stage_one_clfs:
            y_pred = cross_val_predict(clf, X, y, cv=5, n_jobs=1)
            clf.fit(X, y)  # refit on the full data for later predict()
            X = np.hstack((X, y_pred.reshape(-1, 1)))

        # Stage two: fit each meta-regressor on the augmented matrix.
        for clf in self.stage_two_clfs:
            clf.fit(X, y)
        return self

    def _augment(self, X_test):
        """Append each stage-one regressor's predictions as a column.

        BUGFIX: the original assigned the augmented matrix to
        ``self.X_test`` while continuing to read ``self.__X_test``, so
        the stage-two regressors received the *unaugmented* matrix and
        the column count no longer matched the training data.
        """
        X_test = np.asarray(X_test)
        for clf in self.stage_one_clfs:
            y_pred = clf.predict(X_test)
            X_test = np.hstack((X_test, np.reshape(y_pred, (len(y_pred), 1))))
        return X_test

    def predict(self, X_test):
        """Predict each sample's value as the weighted mean of the
        stage-two regressors' predictions.

        NOTE: the original summed n_runs identical weighted predictions
        and divided by n_runs * sum(weights); the n_runs factor cancels
        exactly, so a single weighted mean is equivalent.
        """
        X_aug = self._augment(X_test)
        total = sum(
            w * clf.predict(X_aug)
            for w, clf in zip(self.weights, self.stage_two_clfs)
        )
        return total / np.sum(self.weights)

    def run_gridsearch(self, X, y, params):
        """Grid search over the ensemble.  Not implemented yet."""
        print("Not implemented yet")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can remove these comments