diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..bc26776
Binary files /dev/null and b/.DS_Store differ
diff --git a/README.md b/README.md
index 6ed389f..7831611 100644
--- a/README.md
+++ b/README.md
@@ -38,9 +38,19 @@
 ## Contributor
 
-- 刘帝伟 (Liu Diwei), M.S. student (class of 2014) at Central South University, focusing on AI, machine learning, and deep learning. [HomePage](http://www.csuldw.com)
+- 刘帝伟 (Liu Diwei), M.S. graduate of CSU, focusing on AI, machine learning, and deep learning. [HomePage](http://www.csuldw.com)
 
 ## Contact
 
+If you have any questions, you can leave me a message via my WeChat Official Account:
+
+<!-- WeChat Official Account QR code image -->
+
+Or just send an email:
+
 - E-mail: csu.ldw@csu.edu.cn
diff --git a/doc/boostingexperiments.pdf b/doc/1996 Experiments with a New Boosting Algorithm.pdf
similarity index 100%
rename from doc/boostingexperiments.pdf
rename to doc/1996 Experiments with a New Boosting Algorithm.pdf
diff --git a/draft/stacking.py b/draft/stacking.py
new file mode 100644
index 0000000..915d778
--- /dev/null
+++ b/draft/stacking.py
@@ -0,0 +1,181 @@
+from sklearn.model_selection import KFold
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_digits
+import numpy as np
+from sklearn import metrics
+from sklearn.ensemble import RandomForestClassifier
+from sklearn import preprocessing
+from functools import reduce
+from sklearn.metrics import confusion_matrix, classification_report
+
+class StackingClassifier(object):
+
+    def __init__(self, modellist=None, meta_classifier=None):
+        # avoid a mutable default argument for modellist
+        self.modellist = modellist if modellist is not None else []
+        if meta_classifier is None:
+            from sklearn.linear_model import LogisticRegression
+            meta_classifier = LogisticRegression()
+        self.meta_classifier = meta_classifier
+
+    def SelectModel(self, modelname):
+        if modelname == "SVM":
+            from sklearn.svm import SVC
+            model = SVC(kernel='rbf', C=16, gamma=0.125, probability=True)
+
+        elif modelname == "lr":
+            from sklearn.linear_model import LogisticRegression
+            model = LogisticRegression()
+
+        elif modelname == "GBDT":
+            from sklearn.ensemble import GradientBoostingClassifier
+            model = GradientBoostingClassifier()
+
+        elif modelname == "RF":
+            from sklearn.ensemble import RandomForestClassifier
+            model = RandomForestClassifier()
+
+        elif modelname == "xgboost":
+            from xgboost import XGBClassifier
+            model = XGBClassifier(
+                learning_rate=0.01,
+                n_estimators=1000,
+                max_depth=4,
+                min_child_weight=3,
+                gamma=0.1,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                reg_alpha=1,
+                objective='binary:logistic',  # use multi:softmax for multi-class
+                nthread=8,
+                scale_pos_weight=1,
+                random_state=27
+            )
+        elif modelname == "KNN":
+            from sklearn.neighbors import KNeighborsClassifier as knn
+            model = knn()
+
+        elif modelname == "MNB":
+            from sklearn.naive_bayes import MultinomialNB
+            model = MultinomialNB()
+        else:
+            raise ValueError("unknown model name: {}".format(modelname))
+        return model
+
+    def get_oof(self, clf, n_folds, X_train, y_train, X_test):
+        ntrain = X_train.shape[0]
+        ntest = X_test.shape[0]
+        print("kfolds: ", ntrain, ntest)
+        classnum = len(np.unique(y_train))
+        # shuffle=True is required when a random_state is given
+        kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)
+        oof_train = np.zeros((ntrain, classnum))
+        oof_test = np.zeros((ntest, classnum))
+
+        for i, (train_index, test_index) in enumerate(kf.split(X_train)):
+            kf_X_train = X_train[train_index]  # training data for this fold
+            kf_y_train = y_train[train_index]  # training labels for this fold
+
+            kf_X_test = X_train[test_index]    # held-out fold used as the validation set
+
+            clf.fit(kf_X_train, kf_y_train)
+            oof_train[test_index] = clf.predict_proba(kf_X_test)
+
+            print("fold{i}: oof_train: {a}, oof_test:{b}".format(i=i, a=oof_train.shape, b=oof_test.shape))
+            oof_test += clf.predict_proba(X_test)
+        # average the test-set predictions over the folds
+        oof_test = oof_test / float(n_folds)
+        print("oof_train: {a}, oof_test:{b}".format(a=oof_train.shape, b=oof_test.shape))
+        return oof_train, oof_test
+
+    def first_layer(self, X_train, y_train, X_test, modellist=None):
+        """Build level-1 features (the modellist argument is currently unused; TODO: rework).
+        """
+        newfeature_list = []
+        newtestdata_list = []
+        for modelname in self.modellist:
+            sub_clf = self.SelectModel(modelname)
+            oof_train_, oof_test_ = self.get_oof(clf=sub_clf,
+                                                 n_folds=5,
+                                                 X_train=X_train,
+                                                 y_train=y_train,
+                                                 X_test=X_test)
print("oof_train: ", oof_train_.shape) + print("model-{}".format(modelname),len(oof_train_), len(oof_test_)) + newfeature_list.append(oof_train_) + print("newfeature_list: ", len(newfeature_list)) + newtestdata_list.append(oof_test_) + + # 特征组合 + X_train_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newfeature_list) + X_test_stacking = reduce(lambda x,y:np.concatenate((x,y),axis=1),newtestdata_list) + + return X_train_stacking, X_test_stacking + + def fit(self, X_train, y_train, clf=None): + if clf != None: + self.meta_classifier = clf + self.meta_classifier.fit(X_train, y_train) + return self.meta_classifier + + #second_layer + def second_layer(self, X_train, y_train, clf=None): + return self.fit(X_train, y_train, clf) + + def predict(self, X_test, clf=None, type="label"): + if clf == None: + clf = self.meta_classifier + if type == "proba": + return clf.predict_proba(X_test) + elif type == "label": + return clf.predict(X_test) + + def get_accuracy(self, y_true, y_pred): + accuracy = metrics.accuracy_score(y_true, y_pred)*100 + return accuracy + + def performance(self, y_true, y_pred): + accuracy = self.get_accuracy(y_true, y_pred) + confusion = confusion_matrix(y_true, y_pred) + report = classification_report(y_true, y_pred) + print("多模型融合预测accuracy:{}".format(accuracy)) + print("混淆矩阵:\n{}".format(confusion)) + print("预测结果:\n{}".format(report)) + return confusion, report + + +# 使用stacking方法的时候 +# 第一级,重构特征当做第二级的训练集 +if __name__ == "__main__": + # 导入数据集切割训练与测试数据 + data = load_digits() + data_D = preprocessing.StandardScaler().fit_transform(data.data) + data_L = data.target + X_train, X_test, y_train, y_test = train_test_split(data_D,data_L,random_state=100,test_size=0.7) + print(set(y_train)) + + # 单纯使用一个分类器的时候 + clf_meta = RandomForestClassifier() + clf_meta.fit(X_train, y_train) + pred = clf_meta.predict(X_test) + accuracy = metrics.accuracy_score(y_test, pred)*100 + print ("====================", accuracy) + # 91.0969793323 + + #layer 1:多模型融合 + modelist = ['SVM', 'GBDT', 'RF', 'KNN'] + stacking_clf = StackingClassifier(modelist) + X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test) + print("shape of X_train_stacking {}".format(X_train_stacking.shape)) + print("shape of X_test_stacking {}".format(X_test_stacking.shape)) + + #layer 2: 单模型训练 + RF = stacking_clf.SelectModel(modelname="RF") + clf = stacking_clf.second_layer(X_train_stacking, y_train, clf=RF) + pred = stacking_clf.predict(X_test_stacking) + + #模型评估 + stacking_clf.performance(y_test, pred) + # 96.4228934817 diff --git a/stacking/stacking.py b/stacking/stacking.py new file mode 100644 index 0000000..4091365 --- /dev/null +++ b/stacking/stacking.py @@ -0,0 +1,241 @@ +from sklearn.model_selection import KFold +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_digits +import numpy as np +from sklearn.svm import SVC +from sklearn import metrics +from sklearn.ensemble import RandomForestClassifier +from sklearn import preprocessing +import pandas as pd +from functools import reduce +from sklearn.metrics import confusion_matrix, classification_report +from sklearn.linear_model import LogisticRegression +from sklearn.base import clone +import xgboost as xgb + +class SubClassifier(object): + def __init__(self): + # import lightgbm as lgb + # import xgboost as xgb + # from sklearn.svm import SVC + # from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier + # from sklearn.linear_model import 
diff --git a/stacking/stacking.py b/stacking/stacking.py
new file mode 100644
index 0000000..4091365
--- /dev/null
+++ b/stacking/stacking.py
@@ -0,0 +1,241 @@
+from sklearn.model_selection import KFold
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_digits
+import numpy as np
+from sklearn import metrics
+from sklearn import preprocessing
+from sklearn.metrics import confusion_matrix, classification_report
+from sklearn.base import clone
+
+class SubClassifier(object):
+    def __init__(self):
+        # candidate classifiers, kept for reference:
+        # import lightgbm as lgb
+        # import xgboost as xgb
+        # from sklearn.svm import SVC
+        # from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
+        # from sklearn.linear_model import LogisticRegression
+        # from sklearn.svm import LinearSVC
+        # clfs = {
+        #     'lr': LogisticRegression(penalty='l1', C=0.1, tol=0.0001),
+        #     'svm': LinearSVC(C=0.05, penalty='l2', dual=True),
+        #     'svm_linear': SVC(kernel='linear', probability=True),
+        #     'svm_poly': SVC(kernel='poly', probability=True),
+        #     'bagging': BaggingClassifier(base_estimator=base_clf, n_estimators=60, max_samples=1.0, max_features=1.0,
+        #                                  random_state=1, n_jobs=1, verbose=1),
+        #     'rf': RandomForestClassifier(n_estimators=40, criterion='gini', max_depth=9),
+        #     'adaboost': AdaBoostClassifier(base_estimator=base_clf, n_estimators=50, algorithm='SAMME'),
+        #     'gbdt': GradientBoostingClassifier(),
+        #     'xgb': xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=50),
+        #     'lgb': lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.01, max_depth=5, n_estimators=250, num_leaves=90)
+        # }
+        pass
+
+    def SelectModel(self, modelname):
+        if modelname == "SVM":
+            from sklearn.svm import SVC
+            clf = SVC(kernel='rbf', C=16, gamma=0.125, probability=True)
+
+        elif modelname == "lr":
+            from sklearn.linear_model import LogisticRegression
+            clf = LogisticRegression()
+
+        elif modelname == "GBDT":
+            from sklearn.ensemble import GradientBoostingClassifier
+            clf = GradientBoostingClassifier()
+
+        elif modelname == "RF":
+            from sklearn.ensemble import RandomForestClassifier
+            clf = RandomForestClassifier(n_estimators=100)
+
+        elif modelname == "xgboost":
+            from xgboost import XGBClassifier
+            clf = XGBClassifier(
+                learning_rate=0.01,
+                n_estimators=1000,
+                max_depth=4,
+                min_child_weight=3,
+                gamma=0.1,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                reg_alpha=1,
+                objective='binary:logistic',  # use multi:softmax for multi-class
+                nthread=8,
+                scale_pos_weight=1,
+                random_state=27
+            )
+        elif modelname == "KNN":
+            from sklearn.neighbors import KNeighborsClassifier as knn
+            clf = knn()
+
+        elif modelname == "MNB":
+            from sklearn.naive_bayes import MultinomialNB
+            clf = MultinomialNB()
+        else:
+            raise ValueError("unknown model name: {}".format(modelname))
+        return clf
+
+    def performance(self, y_true, y_pred, modelname=""):
+        accuracy = metrics.accuracy_score(y_true, y_pred) * 100
+        confusion = confusion_matrix(y_true, y_pred)
+        report = classification_report(y_true, y_pred)
+        print("model {} accuracy: {}".format(modelname, accuracy))
+        print("confusion matrix:\n{}".format(confusion))
+        print("classification report:\n{}".format(report))
+        return confusion, report
+
+
+class StackingClassifier(object):
+
+    def __init__(self, classifiers, meta_classifier,
+                 use_clones=True, n_folds=2,
+                 n_classes=2, random_state=100,
+                 sample_weight=None, use_probas=True):
+
+        self.classifiers = classifiers
+        self.meta_classifier = meta_classifier
+        self.use_clones = use_clones
+        self.n_folds = n_folds
+        self.n_classes = n_classes
+        self.random_state = random_state
+        self.sample_weight = sample_weight
+        self.use_probas = use_probas
+
+    def cross_valid_oof(self, clf, X, y, n_folds):
+        """Return the out-of-fold (OOF) cross-validation predictions.
+        """
+        ntrain = X.shape[0]
+        random_state = self.random_state
+        # one column per class when stacking probabilities, a single column otherwise
+        n_cols = self.n_classes if self.use_probas else 1
+        oof_features = np.zeros((ntrain, n_cols))
+        oof_pred = np.zeros(ntrain)
+        # shuffle=True is required when a random_state is given
+        kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
+        for i, (train_index, test_index) in enumerate(kf.split(X)):
+            kf_X_train = X[train_index]  # training data for this fold
+            kf_y_train = y[train_index]  # training labels for this fold
+
+            kf_X_test = X[test_index]    # held-out fold used as the validation set
+
+            clf.fit(kf_X_train, kf_y_train)
+            if not self.use_probas:
+                # reshape to a column so it fits the (ntrain, 1) feature matrix
+                oof_features[test_index] = clf.predict(kf_X_test).reshape(-1, 1)
+            else:
+                oof_features[test_index] = clf.predict_proba(kf_X_test)
+            oof_pred[test_index] = clf.predict(kf_X_test)
+            print("fold-{i}: oof_features: {a}, cv-oof accuracy:{c}".format(i=i,
+                  a=oof_features.shape,
+                  c=self.get_accuracy(y[test_index], oof_pred[test_index])))
+        return oof_features
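+    # fit() makes two passes over the base models: first it builds the
+    # meta-features from out-of-fold predictions, so the meta-classifier is
+    # trained on predictions made by models that never saw those samples;
+    # then it refits every base model on the full training set for inference.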
accuracy:{c}".format(i=i, + a=oof_features.shape, + c=self.get_accuracy(y[test_index], oof_pred[test_index]))) + return oof_features + + def fit(self, X, y): + self.clfs_ = self.classifiers + self.meta_clf_ = self.meta_classifier + + n_folds = self.n_folds + sample_weight = self.sample_weight + meta_features = None + + #feature layer + for name, sub_clf in self.clfs_.items(): + print("feature layer, current model: {}".format(name)) + meta_prediction = self.cross_valid_oof(sub_clf, X, y, n_folds) + if meta_features is None: + meta_features = meta_prediction + else: + meta_features = np.column_stack((meta_features, meta_prediction)) + + for name, model in self.clfs_.items(): + print("fit base model using all train set: {}".format(name)) + if sample_weight is None: + model.fit(X, y) + else: + model.fit(X, y, sample_weight=sample_weight) + + #meta layer + if sample_weight is None: + self.meta_clf_.fit(meta_features, y) + else: + self.meta_clf_.fit(meta_features, y, sample_weight=sample_weight) + + return self + + def predict_meta_features(self, X): + """ Get meta-features of test-data. + Parameters + ------- + X : numpy array, shape = [n_samples, n_features] + + Returns: + ------- + meta-features : numpy array, shape = [n_samples, n_classifiers] + """ + per_model_preds = [] + + for name, model in self.clfs_.items(): + print("model {} predict_meta_features".format(name)) + if not self.use_probas: + pred_score = model.predict(X) + else: + pred_score = model.predict_proba(X) + + per_model_preds.append(pred_score) + + return np.hstack(per_model_preds) + + + def predict(self, X): + """ Predict class label for X.""" + meta_features = self.predict_meta_features(X) + return self.meta_clf_.predict(meta_features) + + def predict_prob(self, X): + """ Predict class probabilities for X.""" + meta_features = self.predict_meta_features(X) + return self.meta_clf_.predict_proba(meta_features) + + def get_accuracy(self, y_true, y_pred): + accuracy = round(metrics.accuracy_score(y_true, y_pred)*100,3) + return accuracy + + def performance(self, y_true, y_pred): + accuracy = self.get_accuracy(y_true, y_pred) + confusion = confusion_matrix(y_true, y_pred) + report = classification_report(y_true, y_pred) + print("多模型融合预测accuracy:{}".format(accuracy)) + print("混淆矩阵:\n{}".format(confusion)) + print("预测结果:\n{}".format(report)) + return confusion, report + +# 使用stacking方法的时候 +if __name__ == "__main__": + # 导入数据集切割训练与测试数据 + data = load_digits() + data_D = preprocessing.StandardScaler().fit_transform(data.data) + data_L = data.target + X_train, X_test, y_train, y_test = train_test_split(data_D,data_L,random_state=100,test_size=0.7) + print(set(y_train)) + + #layer 1:多模型融合 + classifiers = { + 'KNN': SubClassifier().SelectModel(modelname="KNN"), + 'rf': SubClassifier().SelectModel(modelname="RF"), + 'svm': SubClassifier().SelectModel(modelname="SVM"), + 'GBDT': SubClassifier().SelectModel(modelname="GBDT") + } + + meta_classifier = SubClassifier().SelectModel(modelname="RF") + + stacking_clf = StackingClassifier(classifiers, meta_classifier, n_classes=10,n_folds=5) + + stacking_clf.fit(X_train, y_train) + pred = stacking_clf.predict(X_test) + + #模型评估 + stacking_clf.performance(y_test, pred) + # 96.4228934817 \ No newline at end of file