From e362502dd8b20e47326d3d45c321ddfebf24be14 Mon Sep 17 00:00:00 2001
From: tsterbak
Date: Mon, 18 Apr 2016 09:34:12 +0200
Subject: [PATCH 1/2] Add stacking-meta-model

A wrapper that combines models in a two-stage stacking model.
---
 sklearn/ensemble/stacking_model.py | 170 +++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 sklearn/ensemble/stacking_model.py

diff --git a/sklearn/ensemble/stacking_model.py b/sklearn/ensemble/stacking_model.py
new file mode 100644
index 0000000000000..f4087e9d54b88
--- /dev/null
+++ b/sklearn/ensemble/stacking_model.py
@@ -0,0 +1,170 @@
+# libraries
+import numpy as np
+
+# scikit-learn base libraries
+from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
+
+# scikit-learn modules
+from sklearn.cross_validation import cross_val_predict
+from sklearn.ensemble import VotingClassifier
+
+
+class StackingClassifier(BaseEstimator,ClassifierMixin):
+    '''
+    stacking ensemble classifier based on scikit-learn
+    '''
+    def __init__(self,stage_one_clfs,stage_two_clfs,weights=None, n_runs=10):
+        '''
+        stage_one_clfs, stage_two_clfs: lists of first / second stage estimators
+        weights: weights of the stage_two_clfs (defaults to 1 for each of them)
+        n_runs: how often the stage_two_clfs predictions are repeated before averaging
+        '''
+        self.stage_one_clfs = stage_one_clfs
+        self.stage_two_clfs = stage_two_clfs
+        if weights is None:
+            self.weights = [1] * len(stage_two_clfs)
+        else:
+            self.weights = weights
+        self.n_runs = n_runs
+
+    def fit(self,X,y):
+        '''
+        fit the first stage on X, then the second stage on X plus the first stage predictions
+        '''
+        self.__X = X
+        self.__y = y
+
+        # fit the first stage models
+        for clf in self.stage_one_clfs:
+            y_pred = cross_val_predict(clf, X, y, cv=5, n_jobs=1)
+            clf.fit(X,y)
+            y_pred = np.reshape(y_pred,(len(y_pred),1))
+            self.__X = np.hstack((self.__X,y_pred))
+
+        # fit the second stage models
+        for clf in self.stage_two_clfs:
+            clf.fit(self.__X,self.__y)
+
+    def predict(self,X_test):
+        '''
+        predict the class for each sample
+        '''
+        self.__X_test = X_test
+
+        # first stage
+        for clf in self.stage_one_clfs:
+            y_pred = clf.predict(X_test)
+            y_pred = np.reshape(y_pred,(len(y_pred),1))
+            self.__X_test = np.hstack((self.__X_test,y_pred))
+
+        # second stage
+        preds = []
+        for i in range(self.n_runs):
+            j = 0
+            for clf in self.stage_two_clfs:
+                y_pred = clf.predict(self.__X_test)
+                preds.append(self.weights[j] * y_pred)
+                j += 1
+        # average predictions
+        y_final = preds.pop(0)
+        for pred in preds:
+            y_final += pred
+        y_out = y_final/(np.array(self.weights).sum() * self.n_runs)
+
+        return y_out
+
+
+    def predict_proba(self,X_test):
+        '''
+        predict the probability for each class for each sample
+        '''
+        self.__X_test = X_test
+
+        # first stage
+        for clf in self.stage_one_clfs:
+            y_pred = clf.predict(X_test)
+            y_pred = np.reshape(y_pred,(len(y_pred),1))
+            self.__X_test = np.hstack((self.__X_test,y_pred))
+
+        # second stage
+        majority_voting = VotingClassifier(estimators=self.stage_two_clfs, voting="hard", weights=self.weights)
+        y_out = majority_voting.predict(self.__X_test)
+
+        return y_out
+
+    def run_gridsearch(self,X,y,params):
+        '''
+        placeholder, only prints a notice; X, y and params are not used yet
+        '''
+        print("Not implemented yet")
+
+
+class StackingRegressor(BaseEstimator,RegressorMixin):
+    '''
+    stacking ensemble regressor based on scikit-learn
+    '''
+    def __init__(self,stage_one_clfs,stage_two_clfs,weights=None, n_runs=10):
+        '''
+        stage_one_clfs, stage_two_clfs: lists of first / second stage estimators
+        weights: weights of the stage_two_clfs (defaults to 1 for each of them)
+        n_runs: how often the stage_two_clfs predictions are repeated before averaging
+        '''
+        self.stage_one_clfs = stage_one_clfs
+        self.stage_two_clfs = stage_two_clfs
+        if weights is None:
+            self.weights = [1] * len(stage_two_clfs)
+        else:
+            self.weights = weights
+        self.n_runs = n_runs
+
+    def fit(self,X,y):
+        '''
+        fit the first stage on X, then the second stage on X plus the first stage predictions
+        '''
+        self.__X = X
+        self.__y = y
+
+        # fit the first stage models
+        for clf in self.stage_one_clfs:
+            y_pred = cross_val_predict(clf, X, y, cv=5, n_jobs=1)
+            clf.fit(X,y)
+            y_pred = np.reshape(y_pred,(len(y_pred),1))
+            self.__X = np.hstack((self.__X,y_pred))
+
+        # fit the second stage models
+        for clf in self.stage_two_clfs:
+            clf.fit(self.__X,self.__y)
+
+    def predict(self,X_test):
+        '''
+        Predict the value for each sample
+        '''
+        self.__X_test = X_test
+
+        # first stage
+        for clf in self.stage_one_clfs:
+            y_pred = clf.predict(X_test)
+            y_pred = np.reshape(y_pred,(len(y_pred),1))
+            self.__X_test = np.hstack((self.__X_test,y_pred))
+
+        # second stage
+        preds = []
+        for i in range(self.n_runs):
+            j = 0
+            for clf in self.stage_two_clfs:
+                y_pred = clf.predict(self.__X_test)
+                preds.append(self.weights[j] * y_pred)
+                j += 1
+        # average predictions
+        y_final = preds.pop(0)
+        for pred in preds:
+            y_final += pred
+        y_final = y_final/(np.array(self.weights).sum() * self.n_runs)
+
+        return y_final
+
+    def run_gridsearch(self,X,y,params):
+        '''
+        placeholder, only prints a notice; X, y and params are not used yet
+        '''
+        print("Not implemented yet")
\ No newline at end of file

From f770b02dd40e305a90e1075beacece5a43958904 Mon Sep 17 00:00:00 2001
From: tsterbak
Date: Mon, 18 Apr 2016 09:54:34 +0200
Subject: [PATCH 2/2] change the predict/predict_proba method

StackingClassifier.predict now returns the weighted majority vote of the
fitted stage-two classifiers, and predict_proba returns the weighted
average of their class probabilities.
---
 sklearn/ensemble/stacking_model.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/sklearn/ensemble/stacking_model.py b/sklearn/ensemble/stacking_model.py
index f4087e9d54b88..28f6c9023701d 100644
--- a/sklearn/ensemble/stacking_model.py
+++ b/sklearn/ensemble/stacking_model.py
@@ -58,18 +58,16 @@ def predict(self,X_test):
             self.__X_test = np.hstack((self.__X_test,y_pred))
 
         # second stage
-        preds = []
-        for i in range(self.n_runs):
-            j = 0
-            for clf in self.stage_two_clfs:
-                y_pred = clf.predict(self.__X_test)
-                preds.append(self.weights[j] * y_pred)
-                j += 1
-        # average predictions
-        y_final = preds.pop(0)
-        for pred in preds:
-            y_final += pred
-        y_out = y_final/(np.array(self.weights).sum() * self.n_runs)
+        # Weighted majority vote over the already fitted stage-two models.
+        # (An unfitted VotingClassifier cannot predict.)
+        votes = np.asarray([clf.predict(self.__X_test)
+                            for clf in self.stage_two_clfs]).T
+        classes, codes = np.unique(votes, return_inverse=True)
+        codes = codes.reshape(votes.shape)
+        winners = np.apply_along_axis(
+            lambda row: np.argmax(np.bincount(row, weights=self.weights)),
+            axis=1, arr=codes)
+        y_out = classes[winners]
 
         return y_out
 
@@ -87,8 +85,18 @@ def predict_proba(self,X_test):
             self.__X_test = np.hstack((self.__X_test,y_pred))
 
         # second stage
-        majority_voting = VotingClassifier(estimators=self.stage_two_clfs, voting="hard", weights=self.weights)
-        y_out = majority_voting.predict(self.__X_test)
+        preds = []
+        for i in range(self.n_runs):
+            j = 0
+            for clf in self.stage_two_clfs:
+                y_pred = clf.predict_proba(self.__X_test)
+                preds.append(self.weights[j] * y_pred)
+                j += 1
+        # average predictions
+        y_final = preds.pop(0)
+        for pred in preds:
+            y_final += pred
+        y_out = y_final/(np.array(self.weights).sum() * self.n_runs)
 
         return y_out
 