From 21ef8ba96de935afdcdd7600aa279f78aa1c7414 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 00:24:46 +0200 Subject: [PATCH 1/9] Initial gp-based hyperopt code --- sklearn/gp_search.py | 262 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 sklearn/gp_search.py diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py new file mode 100644 index 0000000000000..8498890bdd6cc --- /dev/null +++ b/sklearn/gp_search.py @@ -0,0 +1,262 @@ +""" +The :mod:`sklearn.gp_search` includes utilities to fine-tune the parameters +of an estimator through a Gaussian Process model. +""" +from __future__ import print_function + +# Author: Sebastien Dubois , +# License: BSD 3 clause + +import numpy as np + +from .gaussian_process.gaussian_process import GaussianProcess +from .cross_validation import check_cv +from .cross_validation import _fit_and_score +from .metrics.scorer import check_scoring +from .ensemble import RandomForestClassifier +from .base import is_classifier, clone + +##################### UTILS ##################### + +def sample_candidates(n_candidates,param_bounds,param_isInt): + n_parameters = param_isInt.shape[0] + candidates = [] + + for k in range(n_parameters): + if(param_isInt[k]): + k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] , + dtype = np.int32) + else: + k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] ) + candidates.append(k_sample) + + candidates = np.asarray(candidates) + candidates = candidates.T + + return compute_unique(candidates) + +def compute_unique(a): + # keep only unique values in the ndarray a + # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array + + b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1]))) + _, idx = np.unique(b, return_index=True) + idx =np.sort(idx) + + return a[idx] + +def is_in_ndarray(item,a): + # look for element item in ndarray a + # returns True if item is in a, and its index + + k = 0 + idx_val = np.asarray(range(a.shape[0])) + idxk = range(a.shape[0]) + while( k < a.shape[1]): + idxk = (a[idxk,k]==item[k]) + if(np.sum(idxk > 0)): + k += 1 + idx_val = idx_val[idxk] + idxk = list(idx_val) + else: + return False,0 + + return True,idx_val[0] + + + +##################### GPSearchCV ##################### + +class GPSearchCV(object): + """ + Examples + -------- + >>> parameters = {'kernel' : ['rbf','poly'], + ... 'd' : [1,3], + ... 'C' : [1,10] } + >>> parameters_details = {'kernel' : 'cat', + ... 'd' : 'int', + ... 
'C' : 'float'} + + """ + + def __init__(self, + parameters, + parameters_details, + estimator, scoring=None, + X=None,y=None, + fit_params=None, + refit=True, + cv=None, + acquisition_function = 'UCB', + n_iter=100, + n_init=10, + n_candidates = 500, + gp_nugget=1.e-10, + verbose=True): + + self.parameters = parameters + self.n_parameters = len(parameters) + self.parameters_details = parameters_details + self.acquisition_function = acquisition_function + self.n_iter = n_iter + self.n_init = n_init + self.n_candidates = n_candidates + self.param_names = parameters.keys() + self.param_isInt = np.array([ 0 if (parameters_details[k]=='float') else 1 for k in self.param_names ]) + self.param_bounds = np.zeros((self.n_parameters,2)) + self.gp_nugget = gp_nugget + self.verbose = verbose + self.scoring = scoring + self.estimator = estimator + self.fit_params = fit_params if fit_params is not None else {} + self.cv = cv + self.X = X + self.y = y + + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) + + # init param_bounds + for i in range(self.n_parameters): + if(parameters_details[self.param_names[i]]=='cat'): + self.param_bounds[i,0] = 0 + self.param_bounds[i,1] = len(parameters[self.param_names[i]]) + else: + self.param_bounds[i] = np.array(parameters[self.param_names[i]]) + if(parameters_details[self.param_names[i]]=='int'): + self.param_bounds[i,1] += 1 + + if(self.verbose): + print(self.parameters) + print(self.parameters_details) + print(self.param_names) + print(self.param_isInt) + print(self.param_bounds) + + + def vector_to_dict(self,vector_parameter): + dict_parameter = dict.fromkeys(self.param_names) + for i in range(self.n_parameters): + if(self.parameters_details[self.param_names[i]]=='cat'): + dict_parameter[self.param_names[i]] = self.parameters[self.param_names[i]][int(vector_parameter[i])] + elif(self.parameters_details[self.param_names[i]]=='int'): + dict_parameter[self.param_names[i]] = int(vector_parameter[i]) + else: + dict_parameter[self.param_names[i]] = vector_parameter[i] + + return dict_parameter + + def score(self,test_parameter): + cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) + cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, + train, test, False, test_parameter, + self.fit_params, return_parameters=True) + for train, test in cv ] + + n_test_samples = 0 + score = 0 + for tmp_score, tmp_n_test_samples, _, _ in cv_score: + tmp_score *= tmp_n_test_samples + n_test_samples += tmp_n_test_samples + score += tmp_score + score /= float(n_test_samples) + + return score + + + def fit(self): + + n_tested_parameters = 0 + tested_parameters = np.zeros((self.n_iter,self.n_parameters)) + cv_scores = np.zeros(self.n_iter) + + ### Initialize with random candidates ### + init_candidates = sample_candidates(self.n_init,self.param_bounds,self.param_isInt) + + for i in range(self.n_init): + cand = self.vector_to_dict(init_candidates[i,:]) + cv_score = self.score(cand) + + if(self.verbose): + print ('Step ' + str(i) + ' - Hyperparameter ' + str(init_candidates[i,:]) + ' ' + str(cv_score)) + + is_in,idx = is_in_ndarray(init_candidates[i,:],tested_parameters[:n_tested_parameters,:]) + if not is_in: + tested_parameters[n_tested_parameters,:] = init_candidates[i,:] + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. 
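+                # Note: sample_candidates() already deduplicates within a
+                # batch via compute_unique(), so the is_in_ndarray() check
+                # above only guards against re-drawing a point evaluated at
+                # an earlier step; its stored cv score is then averaged with
+                # the new evaluation rather than occupying a second slot.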
+ + + for i in range(self.n_iter-self.n_init): + + # Model with a Gaussian Process + gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters) , + thetaL = 0.001 * np.ones(self.n_parameters) , + thetaU = 10. * np.ones(self.n_parameters) , + nugget= self.gp_nugget) + gp.fit(tested_parameters[:n_tested_parameters,:],cv_scores[:n_tested_parameters]) + + # Sample candidates and predict their corresponding acquisition values + candidates = sample_candidates(self.n_candidates,self.param_bounds,self.param_isInt) + if(self.acquisition_function == 'UCB'): + predictions,MSE = gp.predict(candidates,eval_MSE=True) + upperBound = predictions + 1.96*np.sqrt(MSE) + best_candidate = candidates[np.argmax(upperBound)] + + else: + print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) + + cv_score = self.score(self.vector_to_dict(best_candidate)) + if(self.verbose): + print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(best_candidate) + ' ' + str(cv_score)) + + is_in,idx = is_in_ndarray(best_candidate,tested_parameters[:n_tested_parameters,:]) + if not is_in: + tested_parameters[n_tested_parameters,:] = best_candidate + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. + + best_idx = np.argmax(cv_scores[:n_tested_parameters]) + vector_best_param = tested_parameters[best_idx] + best_parameter = self.vector_to_dict(vector_best_param) + + if(self.verbose): + print ('\nTested ' + str(n_tested_parameters) + ' parameters') + print ('Max cv score ' + str(cv_scores[best_idx])) + print ('Best parameter ' + str(tested_parameters[best_idx])) + print(best_parameter) + + +def test(): + from sklearn.datasets import load_digits + iris = load_digits() + X, y = iris.data, iris.target + clf = RandomForestClassifier(n_estimators=20) + + # specify parameters and distributions to sample from + parameters = {"max_depth": [3, 3], + "max_features": [1,11], + "min_samples_split": [1,11], + "min_samples_leaf": [1,11], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + + parameters_details = {"max_depth": 'int', + "max_features": 'int', + "min_samples_split": 'int', + "min_samples_leaf": 'int', + "bootstrap": 'cat', + "criterion": 'cat'} + + search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) + search.fit() + +test() \ No newline at end of file From 37ebc2621206cde522a99d36c75939703bf33446 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 15:11:01 +0200 Subject: [PATCH 2/9] GPSearchCV handles both sklearn Estimators and custom score functions --- sklearn/gp_search.py | 86 ++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 8498890bdd6cc..3f5d54c2438a4 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -13,7 +13,6 @@ from .cross_validation import check_cv from .cross_validation import _fit_and_score from .metrics.scorer import check_scoring -from .ensemble import RandomForestClassifier from .base import is_classifier, clone ##################### UTILS ##################### @@ -83,7 +82,8 @@ class GPSearchCV(object): def __init__(self, parameters, parameters_details, - estimator, scoring=None, + estimator, + scoring=None, X=None,y=None, fit_params=None, refit=True, @@ -114,7 +114,16 @@ def __init__(self, self.X = X self.y = y - self.scorer_ = check_scoring(self.estimator, 
scoring=self.scoring) + if(callable(estimator)): + self._callable_estimator = True + if(verbose): + print('Estimator is a callable and not an sklearn Estimator') + else: + self._callable_estimator = False + + + if not self._callable_estimator: + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) # init param_bounds for i in range(self.n_parameters): @@ -147,24 +156,28 @@ def vector_to_dict(self,vector_parameter): return dict_parameter def score(self,test_parameter): - cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) - cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, - train, test, False, test_parameter, - self.fit_params, return_parameters=True) - for train, test in cv ] - - n_test_samples = 0 - score = 0 - for tmp_score, tmp_n_test_samples, _, _ in cv_score: - tmp_score *= tmp_n_test_samples - n_test_samples += tmp_n_test_samples - score += tmp_score - score /= float(n_test_samples) + if not self._callable_estimator: + cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) + cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, + train, test, False, test_parameter, + self.fit_params, return_parameters=True) + for train, test in cv ] + + n_test_samples = 0 + score = 0 + for tmp_score, tmp_n_test_samples, _, _ in cv_score: + tmp_score *= tmp_n_test_samples + n_test_samples += tmp_n_test_samples + score += tmp_score + score /= float(n_test_samples) + + else: + score = self.estimator(test_parameter) return score - def fit(self): + def _fit(self): n_tested_parameters = 0 tested_parameters = np.zeros((self.n_iter,self.n_parameters)) @@ -172,13 +185,14 @@ def fit(self): ### Initialize with random candidates ### init_candidates = sample_candidates(self.n_init,self.param_bounds,self.param_isInt) + self.n_init = init_candidates.shape[0] for i in range(self.n_init): - cand = self.vector_to_dict(init_candidates[i,:]) - cv_score = self.score(cand) + dict_candidate = self.vector_to_dict(init_candidates[i,:]) + cv_score = self.score(dict_candidate) if(self.verbose): - print ('Step ' + str(i) + ' - Hyperparameter ' + str(init_candidates[i,:]) + ' ' + str(cv_score)) + print ('Step ' + str(i) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) is_in,idx = is_in_ndarray(init_candidates[i,:],tested_parameters[:n_tested_parameters,:]) if not is_in: @@ -210,9 +224,10 @@ def fit(self): else: print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) - cv_score = self.score(self.vector_to_dict(best_candidate)) + dict_candidate = self.vector_to_dict(best_candidate) + cv_score = self.score(dict_candidate) if(self.verbose): - print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(best_candidate) + ' ' + str(cv_score)) + print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) is_in,idx = is_in_ndarray(best_candidate,tested_parameters[:n_tested_parameters,:]) if not is_in: @@ -234,29 +249,4 @@ def fit(self): print ('Best parameter ' + str(tested_parameters[best_idx])) print(best_parameter) - -def test(): - from sklearn.datasets import load_digits - iris = load_digits() - X, y = iris.data, iris.target - clf = RandomForestClassifier(n_estimators=20) - - # specify parameters and distributions to sample from - parameters = {"max_depth": [3, 3], - "max_features": [1,11], - "min_samples_split": [1,11], - "min_samples_leaf": [1,11], - "bootstrap": [True, False], - "criterion": ["gini", 
"entropy"]} - - parameters_details = {"max_depth": 'int', - "max_features": 'int', - "min_samples_split": 'int', - "min_samples_leaf": 'int', - "bootstrap": 'cat', - "criterion": 'cat'} - - search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) - search.fit() - -test() \ No newline at end of file + return tested_parameters[:n_tested_parameters,:], cv_scores[:n_tested_parameters] \ No newline at end of file From e7677b6690ffda4a20588cb3a8ce15a238a4e0f7 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 15:15:08 +0200 Subject: [PATCH 3/9] FIX typo --- sklearn/gp_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 3f5d54c2438a4..44e1c201e2303 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -200,7 +200,7 @@ def _fit(self): cv_scores[n_tested_parameters] = cv_score n_tested_parameters += 1 else: - if(verbose): + if(self.verbose): print('Hyperparameter already tesed') cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. @@ -235,7 +235,7 @@ def _fit(self): cv_scores[n_tested_parameters] = cv_score n_tested_parameters += 1 else: - if(verbose): + if(self.verbose): print('Hyperparameter already tesed') cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. From 2a043a44618e1d898d084c99bc106fc12c6203b3 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 19:02:52 +0200 Subject: [PATCH 4/9] Adding example for GPSearchCV --- examples/model_selection/gp_search.py | 252 ++++++++++++++++++++++++++ sklearn/gp_search.py | 1 + 2 files changed, 253 insertions(+) create mode 100644 examples/model_selection/gp_search.py diff --git a/examples/model_selection/gp_search.py b/examples/model_selection/gp_search.py new file mode 100644 index 0000000000000..72ff41c280df7 --- /dev/null +++ b/examples/model_selection/gp_search.py @@ -0,0 +1,252 @@ +from sklearn.datasets import load_digits +from sklearn.gp_search import GPSearchCV +from sklearn.ensemble import RandomForestClassifier +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.linear_model import SGDClassifier +from sklearn.grid_search import GridSearchCV, RandomizedSearchCV +from sklearn.pipeline import Pipeline + +import logging +import matplotlib.pyplot as plt +import numpy as np + +def extend_result(n_tests,tmp_res): + res = np.zeros(n_tests) + l = len(tmp_res) -1 + for i in range(n_tests): + res[i] = tmp_res[min(i,l)] + + return res + + +def test1(): + iris = load_digits() + X, y = iris.data, iris.target + clf = RandomForestClassifier(n_estimators=20) + + # specify parameters and distributions to sample from + parameters = {"max_depth": [3, 3], + "max_features": [1,11], + "min_samples_split": [1,11], + "min_samples_leaf": [1,11], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + + parameters_details = {"max_depth": 'int', + "max_features": 'int', + "min_samples_split": 'int', + "min_samples_leaf": 'int', + "bootstrap": 'cat', + "criterion": 'cat'} + + search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) + search._fit() + + +def test2(): + parameters = {'kernel' : ['rbf','poly'],'d' : [1,3],'C' : [1,10] } + parameters_details = {'kernel' : 'cat','d' : 'int','C' : 'float'} + def scoring_function(x): + return 0.5 + + search = GPSearchCV(parameters,parameters_details,estimator=scoring_function,n_iter=20) + search._fit() + + +def test3(): + # Display 
progress logs on stdout + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s %(message)s') + + # Load some categories from the training set + categories = [ + 'alt.atheism', + 'talk.religion.misc', + ] + # Uncomment the following to do the analysis on all the categories + #categories = None + + print("Loading 20 newsgroups dataset for categories:") + print(categories) + + data = fetch_20newsgroups(subset='train', categories=categories) + print("%d documents" % len(data.filenames)) + print("%d categories" % len(data.target_names)) + print() + + # define a pipeline combining a text feature extractor with a simple + # classifier + pipeline = Pipeline([ + ('vect', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('clf', SGDClassifier()), + ]) + + # uncommenting more parameters will give better exploring power but will + # increase processing time in a combinatorial way + parameters = { + 'vect__max_df': [0.5,1.], + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': [(1, 1), (1, 2)], # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': [0.000001, 0.00001], + 'clf__penalty': ['l2', 'elasticnet'], + #'clf__n_iter': (10, 50, 80), + } + + parameters_details = { + 'vect__max_df': 'float', + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': 'cat' , # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': 'float', + 'clf__penalty': 'cat', + #'clf__n_iter': (10, 50, 80), + } + + search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=data.data, y=data.target,n_iter=20) + search._fit() + + +def gp_vs_random_search(test_name,n_tests,search_lenght): + """ + Compare GP-based search vs a simple random one + Choose test_name in {'iris','text'} + """ + + n_iter_search = search_lenght + + if(test_name == 'iris'): + iris = load_digits() + X, y = iris.data, iris.target + pipeline = RandomForestClassifier(n_estimators=20) + + # specify parameters and distributions to sample from + parameters = {"max_depth": [2, 15], + "max_features": [1,20], + "min_samples_split": [1,20], + "min_samples_leaf": [1,20], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + + parameters_details = {"max_depth": 'int', + "max_features": 'int', + "min_samples_split": 'int', + "min_samples_leaf": 'int', + "bootstrap": 'cat', + "criterion": 'cat'} + + elif(test_name == 'text'): + # Display progress logs on stdout + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s %(message)s') + + # Load some categories from the training set + categories = [ + 'alt.atheism', + 'talk.religion.misc', + ] + # Uncomment the following to do the analysis on all the categories + #categories = None + print("Loading 20 newsgroups dataset for categories:") + print(categories) + + data = fetch_20newsgroups(subset='train', categories=categories) + print("%d documents" % len(data.filenames)) + print("%d categories" % len(data.target_names)) + + X = data.data + y = data.target + + # define a pipeline combining a text feature extractor with a simple + # classifier + pipeline = Pipeline([ + ('vect', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('clf', SGDClassifier()), + ]) + + # uncommenting more parameters will give better exploring power but will + # increase processing time in a combinatorial way + parameters = { + 'vect__max_df': [0.5,1.], + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': [(1, 1), 
(1, 2)], # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': [0.000001, 0.00001], + 'clf__penalty': ['l2', 'elasticnet'], + #'clf__n_iter': (10, 50, 80), + } + + parameters_details = { + 'vect__max_df': 'float', + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': 'cat' , # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': 'float', + 'clf__penalty': 'cat', + #'clf__n_iter': (10, 50, 80), + } + + else: + print('Dataset not available for test') + + # GP search + all_gp_results = [] + print 'GP search' + for i in range(n_tests): + search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + n_iter=n_iter_search, n_init=20, verbose=False) + _,scores = search._fit() + + max_scores = [scores[0]] + print 'Test',i,'-',len(scores),'parameters tested' + + for j in range(1,len(scores)): + max_scores.append(max(max_scores[j-1],scores[j])) + all_gp_results.append(extend_result(n_iter_search,max_scores)) + all_gp_results = np.asarray(all_gp_results) + print all_gp_results.shape + + # Randomized search + print 'Random search' + all_random_results = [] + for i in range(n_tests): + random_search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + n_iter=n_iter_search, n_init=n_iter_search, verbose=False) + _,scores = search._fit() + + max_scores = [scores[0]] + print 'Test',i,'-',len(scores),'parameters tested' + + for j in range(1,len(scores)): + max_scores.append(max(max_scores[j-1],scores[j])) + all_random_results.append(extend_result(n_iter_search,max_scores)) + all_random_results = np.asarray(all_random_results) + + plt.figure() + plt.plot(range(n_iter_search),np.mean(all_gp_results,axis=0),'r',label='GP') + plt.plot(range(n_iter_search),np.mean(all_random_results,axis=0),'g',label='Random') + plt.legend() + plt.title('Test GP vs Random on ' + test_name +' dataset - Average on ' + str(n_tests) + ' trials') + plt.xlabel('Iterations') + plt.ylabel('Max CV performance') + plt.show() + + + +if __name__ == "__main__": + + # print 'Routine Test' + # test2() + test_name = 'iris' + n_tests = 20 + search_lenght = 50 + print '\nTest GP vs Random on',test_name,'dataset - Average on',n_tests,'trials' + gp_vs_random_search(test_name,n_tests,search_lenght) \ No newline at end of file diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 44e1c201e2303..28c70876eaba4 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -211,6 +211,7 @@ def _fit(self): gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters) , thetaL = 0.001 * np.ones(self.n_parameters) , thetaU = 10. 
* np.ones(self.n_parameters) , + random_start = 3, nugget= self.gp_nugget) gp.fit(tested_parameters[:n_tested_parameters,:],cv_scores[:n_tested_parameters]) From 69b56f49d49e34612b8e970781276bc59d9e5ba6 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Fri, 7 Aug 2015 17:53:33 +0200 Subject: [PATCH 5/9] Added EI and changed parameter descriptions --- sklearn/gp_search.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 28c70876eaba4..505f3c21ee31a 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -8,6 +8,7 @@ # License: BSD 3 clause import numpy as np +from scipy.stats import norm from .gaussian_process.gaussian_process import GaussianProcess from .cross_validation import check_cv @@ -34,6 +35,13 @@ def sample_candidates(n_candidates,param_bounds,param_isInt): return compute_unique(candidates) +def compute_ei(predictions,sigma,y_best): + ei_array = np.zeros(predictions.shape[0]) + for i in range(ei_array.shape[0]): + z = (y_best - predictions[i]) / sigma[i] + ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z)) + return ei_array + def compute_unique(a): # keep only unique values in the ndarray a # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array @@ -70,18 +78,14 @@ class GPSearchCV(object): """ Examples -------- - >>> parameters = {'kernel' : ['rbf','poly'], - ... 'd' : [1,3], - ... 'C' : [1,10] } - >>> parameters_details = {'kernel' : 'cat', - ... 'd' : 'int', - ... 'C' : 'float'} + >>> parameters = {'kernel' : ['cat', ['rbf','poly']], + ... 'd' : ['int', [1,3]], + ... 'C' : ['float',[1,10])} """ def __init__(self, parameters, - parameters_details, estimator, scoring=None, X=None,y=None, @@ -97,13 +101,12 @@ def __init__(self, self.parameters = parameters self.n_parameters = len(parameters) - self.parameters_details = parameters_details self.acquisition_function = acquisition_function self.n_iter = n_iter self.n_init = n_init self.n_candidates = n_candidates self.param_names = parameters.keys() - self.param_isInt = np.array([ 0 if (parameters_details[k]=='float') else 1 for k in self.param_names ]) + self.param_isInt = np.array([ 0 if (parameters[k][0]=='float') else 1 for k in self.param_names ]) self.param_bounds = np.zeros((self.n_parameters,2)) self.gp_nugget = gp_nugget self.verbose = verbose @@ -127,17 +130,16 @@ def __init__(self, # init param_bounds for i in range(self.n_parameters): - if(parameters_details[self.param_names[i]]=='cat'): + if(parameters[self.param_names[i]][0]=='cat'): self.param_bounds[i,0] = 0 - self.param_bounds[i,1] = len(parameters[self.param_names[i]]) + self.param_bounds[i,1] = len(parameters[self.param_names[i]][1]) else: - self.param_bounds[i] = np.array(parameters[self.param_names[i]]) - if(parameters_details[self.param_names[i]]=='int'): + self.param_bounds[i] = np.array(parameters[self.param_names[i]][1]) + if(parameters[self.param_names[i]][0]=='int'): self.param_bounds[i,1] += 1 if(self.verbose): print(self.parameters) - print(self.parameters_details) print(self.param_names) print(self.param_isInt) print(self.param_bounds) @@ -146,9 +148,9 @@ def __init__(self, def vector_to_dict(self,vector_parameter): dict_parameter = dict.fromkeys(self.param_names) for i in range(self.n_parameters): - if(self.parameters_details[self.param_names[i]]=='cat'): - dict_parameter[self.param_names[i]] = self.parameters[self.param_names[i]][int(vector_parameter[i])] - 
elif(self.parameters_details[self.param_names[i]]=='int'): + if(self.parameters[self.param_names[i]][0]=='cat'): + dict_parameter[self.param_names[i]] = (self.parameters[self.param_names[i]][1])[int(vector_parameter[i])] + elif(self.parameters[self.param_names[i]][0]=='int'): dict_parameter[self.param_names[i]] = int(vector_parameter[i]) else: dict_parameter[self.param_names[i]] = vector_parameter[i] @@ -222,6 +224,12 @@ def _fit(self): upperBound = predictions + 1.96*np.sqrt(MSE) best_candidate = candidates[np.argmax(upperBound)] + elif(self.acquisition_function == 'EI'): + predictions,MSE = gp.predict(candidates,eval_MSE=True) + y_best = np.max(cv_scores) + ei = compute_ei(predictions,np.sqrt(MSE),y_best) + best_candidate = candidates[np.argmax(ei)] + else: print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) From bb9fb488c29e25fabe4032a486a5653c25c1cb9a Mon Sep 17 00:00:00 2001 From: Sebastien Date: Fri, 7 Aug 2015 17:55:05 +0200 Subject: [PATCH 6/9] example/update and fixed typo --- examples/model_selection/gp_search.py | 155 ++++++++++++-------------- 1 file changed, 72 insertions(+), 83 deletions(-) diff --git a/examples/model_selection/gp_search.py b/examples/model_selection/gp_search.py index 72ff41c280df7..4b47e645536bb 100644 --- a/examples/model_selection/gp_search.py +++ b/examples/model_selection/gp_search.py @@ -27,31 +27,23 @@ def test1(): clf = RandomForestClassifier(n_estimators=20) # specify parameters and distributions to sample from - parameters = {"max_depth": [3, 3], - "max_features": [1,11], - "min_samples_split": [1,11], - "min_samples_leaf": [1,11], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]} - - parameters_details = {"max_depth": 'int', - "max_features": 'int', - "min_samples_split": 'int', - "min_samples_leaf": 'int', - "bootstrap": 'cat', - "criterion": 'cat'} - - search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) + parameters = {"max_depth": ['int',[3, 3]], + "max_features": ['int',[1,11]], + "min_samples_split": ['int',[1,11]], + "min_samples_leaf": ['int',[1,11]], + "bootstrap": ['cat',[True, False]], + "criterion": ['cat',["gini", "entropy"]]} + + search = GPSearchCV(parameters,estimator=clf,X=X,y=y,n_iter=20) search._fit() def test2(): - parameters = {'kernel' : ['rbf','poly'],'d' : [1,3],'C' : [1,10] } - parameters_details = {'kernel' : 'cat','d' : 'int','C' : 'float'} + parameters = {'kernel' : ['cat',['rbf','poly']],'d' : ['int',[1,3]],'C' : ['float',[1,10]] } def scoring_function(x): return 0.5 - search = GPSearchCV(parameters,parameters_details,estimator=scoring_function,n_iter=20) + search = GPSearchCV(parameters,estimator=scoring_function,n_iter=20) search._fit() @@ -87,32 +79,21 @@ def test3(): # uncommenting more parameters will give better exploring power but will # increase processing time in a combinatorial way parameters = { - 'vect__max_df': [0.5,1.], + 'vect__max_df': ['float',[0.5,1.]], #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': [(1, 1), (1, 2)], # unigrams or bigrams + 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams #'tfidf__use_idf': (True, False), #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': [0.000001, 0.00001], - 'clf__penalty': ['l2', 'elasticnet'], + 'clf__alpha': ['float',[0.000001, 0.00001]], + 'clf__penalty': ['cat',['l2', 'elasticnet']] #'clf__n_iter': (10, 50, 80), } - parameters_details = { - 'vect__max_df': 'float', - #'vect__max_features': (None, 5000, 10000, 50000), - 
'vect__ngram_range': 'cat' , # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': 'float', - 'clf__penalty': 'cat', - #'clf__n_iter': (10, 50, 80), - } - - search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=data.data, y=data.target,n_iter=20) + search = GPSearchCV(parameters,estimator=pipeline,X=data.data, y=data.target,n_iter=20) search._fit() -def gp_vs_random_search(test_name,n_tests,search_lenght): +def gp_vs_random_search(test_name,n_tests,search_lenght,save_data=False): """ Compare GP-based search vs a simple random one Choose test_name in {'iris','text'} @@ -123,22 +104,15 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): if(test_name == 'iris'): iris = load_digits() X, y = iris.data, iris.target - pipeline = RandomForestClassifier(n_estimators=20) + pipeline = RandomForestClassifier() # specify parameters and distributions to sample from - parameters = {"max_depth": [2, 15], - "max_features": [1,20], - "min_samples_split": [1,20], - "min_samples_leaf": [1,20], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]} - - parameters_details = {"max_depth": 'int', - "max_features": 'int', - "min_samples_split": 'int', - "min_samples_leaf": 'int', - "bootstrap": 'cat', - "criterion": 'cat'} + parameters = {"max_depth": ['int',[3, 3]], + "max_features": ['int',[1,11]], + "min_samples_split": ['int',[1,11]], + "min_samples_leaf": ['int',[1,11]], + "bootstrap": ['cat',[True, False]], + "criterion": ['cat',["gini", "entropy"]]} elif(test_name == 'text'): # Display progress logs on stdout @@ -173,54 +147,66 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): # uncommenting more parameters will give better exploring power but will # increase processing time in a combinatorial way parameters = { - 'vect__max_df': [0.5,1.], - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': [(1, 1), (1, 2)], # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': [0.000001, 0.00001], - 'clf__penalty': ['l2', 'elasticnet'], - #'clf__n_iter': (10, 50, 80), - } - - parameters_details = { - 'vect__max_df': 'float', - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': 'cat' , # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': 'float', - 'clf__penalty': 'cat', - #'clf__n_iter': (10, 50, 80), + 'vect__max_df': ['float',[0.5,1.]], + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': ['float',[0.000001, 0.00001]], + 'clf__penalty': ['cat',['l2', 'elasticnet']] + #'clf__n_iter': (10, 50, 80), } else: print('Dataset not available for test') - # GP search - all_gp_results = [] - print 'GP search' + # GP UCB search + all_gp_ucb_results = [] + print 'GP_ucb search' for i in range(n_tests): - search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + ucb_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, + acquisition_function='UCB', n_iter=n_iter_search, n_init=20, verbose=False) - _,scores = search._fit() + _,scores = ucb_search._fit() max_scores = [scores[0]] print 'Test',i,'-',len(scores),'parameters tested' for j in range(1,len(scores)): max_scores.append(max(max_scores[j-1],scores[j])) - all_gp_results.append(extend_result(n_iter_search,max_scores)) - all_gp_results = 
np.asarray(all_gp_results) - print all_gp_results.shape + all_gp_ucb_results.append(extend_result(n_iter_search,max_scores)) + all_gp_ucb_results = np.asarray(all_gp_ucb_results) + print all_gp_ucb_results.shape + if(save_data): + np.savetxt('gp_ucb_scores.csv',all_gp_ucb_results,delimiter=',') + + # # GP EI search + # all_gp_ei_results = [] + # print 'GP_ei search' + # for i in range(n_tests): + # ei_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, + # acquisition_function='EI', + # n_iter=n_iter_search, n_init=20, verbose=False) + # _,scores = ei_search._fit() + + # max_scores = [scores[0]] + # print 'Test',i,'-',len(scores),'parameters tested' + + # for j in range(1,len(scores)): + # max_scores.append(max(max_scores[j-1],scores[j])) + # all_gp_ei_results.append(extend_result(n_iter_search,max_scores)) + # all_gp_ei_results = np.asarray(all_gp_ei_results) + # print all_gp_ei_results.shape + # if(save_data): + # np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',') # Randomized search print 'Random search' all_random_results = [] for i in range(n_tests): - random_search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + random_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, n_iter=n_iter_search, n_init=n_iter_search, verbose=False) - _,scores = search._fit() + _,scores = random_search._fit() max_scores = [scores[0]] print 'Test',i,'-',len(scores),'parameters tested' @@ -229,11 +215,14 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): max_scores.append(max(max_scores[j-1],scores[j])) all_random_results.append(extend_result(n_iter_search,max_scores)) all_random_results = np.asarray(all_random_results) + if(save_data): + np.savetxt('rand_scores.csv',all_random_results,delimiter=',') plt.figure() - plt.plot(range(n_iter_search),np.mean(all_gp_results,axis=0),'r',label='GP') + # plt.plot(range(n_iter_search),np.mean(all_gp_ei_results,axis=0),'r',label='GP-EI') + plt.plot(range(n_iter_search),np.mean(all_gp_ucb_results,axis=0),'b',label='GP-UCB') plt.plot(range(n_iter_search),np.mean(all_random_results,axis=0),'g',label='Random') - plt.legend() + plt.legend(loc=4) plt.title('Test GP vs Random on ' + test_name +' dataset - Average on ' + str(n_tests) + ' trials') plt.xlabel('Iterations') plt.ylabel('Max CV performance') @@ -245,8 +234,8 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): # print 'Routine Test' # test2() - test_name = 'iris' + test_name = 'text' n_tests = 20 - search_lenght = 50 + search_lenght = 60 print '\nTest GP vs Random on',test_name,'dataset - Average on',n_tests,'trials' - gp_vs_random_search(test_name,n_tests,search_lenght) \ No newline at end of file + gp_vs_random_search(test_name,n_tests,search_lenght,save_data=True) \ No newline at end of file From 7af5f6a3ef1f9201fdf129fac103bbacc36de3da Mon Sep 17 00:00:00 2001 From: Sebastien Date: Sat, 29 Aug 2015 19:40:58 +0200 Subject: [PATCH 7/9] FIX for pep8 conformity --- sklearn/gp_search.py | 471 +++++++++++++++++++++++-------------------- 1 file changed, 250 insertions(+), 221 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 505f3c21ee31a..d0529d779a07f 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -16,246 +16,275 @@ from .metrics.scorer import check_scoring from .base import is_classifier, clone -##################### UTILS ##################### -def sample_candidates(n_candidates,param_bounds,param_isInt): - n_parameters = param_isInt.shape[0] - candidates = [] +# UTILS # - for k in range(n_parameters): - 
if(param_isInt[k]): - k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] , - dtype = np.int32) - else: - k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] ) - candidates.append(k_sample) - candidates = np.asarray(candidates) - candidates = candidates.T +def sample_candidates(n_candidates, param_bounds, param_isInt): - return compute_unique(candidates) + n_parameters = param_isInt.shape[0] + candidates = [] + + for k in range(n_parameters): + if(param_isInt[k]): + k_sample = np.asarray( + np.random.rand(n_candidates) + * np.float(param_bounds[k][1]-param_bounds[k][0]) + + param_bounds[k][0], + dtype=np.int32) + else: + k_sample = np.asarray( + np.random.rand(n_candidates) + * np.float(param_bounds[k][1]-param_bounds[k][0]) + + param_bounds[k][0]) + candidates.append(k_sample) + + candidates = np.asarray(candidates) + candidates = candidates.T + + return compute_unique(candidates) + + +def compute_ei(predictions, sigma, y_best): + ei_array = np.zeros(predictions.shape[0]) + for i in range(ei_array.shape[0]): + z = (y_best - predictions[i]) / sigma[i] + ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z)) + return ei_array -def compute_ei(predictions,sigma,y_best): - ei_array = np.zeros(predictions.shape[0]) - for i in range(ei_array.shape[0]): - z = (y_best - predictions[i]) / sigma[i] - ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z)) - return ei_array def compute_unique(a): - # keep only unique values in the ndarray a - # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array + # keep only unique values in the ndarray a + # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array - b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1]))) - _, idx = np.unique(b, return_index=True) - idx =np.sort(idx) + b = np.ascontiguousarray(a).view( + np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + ) + _, idx = np.unique(b, return_index=True) + idx = np.sort(idx) - return a[idx] + return a[idx] -def is_in_ndarray(item,a): - # look for element item in ndarray a - # returns True if item is in a, and its index - - k = 0 - idx_val = np.asarray(range(a.shape[0])) - idxk = range(a.shape[0]) - while( k < a.shape[1]): - idxk = (a[idxk,k]==item[k]) - if(np.sum(idxk > 0)): - k += 1 - idx_val = idx_val[idxk] - idxk = list(idx_val) - else: - return False,0 - return True,idx_val[0] +def is_in_ndarray(item, a): + # look for element item in ndarray a + # returns True if item is in a, and its index + k = 0 + idx_val = np.asarray(range(a.shape[0])) + idxk = range(a.shape[0]) + while(k < a.shape[1]): + idxk = (a[idxk, k] == item[k]) + if(np.sum(idxk > 0)): + k += 1 + idx_val = idx_val[idxk] + idxk = list(idx_val) + else: + return False, 0 + return True, idx_val[0] -##################### GPSearchCV ##################### +# GPSearchCV # class GPSearchCV(object): - """ + """ Examples -------- >>> parameters = {'kernel' : ['cat', ['rbf','poly']], - ... 'd' : ['int', [1,3]], - ... 'C' : ['float',[1,10])} + ... 'd' : ['int', [1,3]], + ... 
'C' : ['float',[1,10])} """ - def __init__(self, - parameters, - estimator, - scoring=None, - X=None,y=None, - fit_params=None, - refit=True, - cv=None, - acquisition_function = 'UCB', - n_iter=100, - n_init=10, - n_candidates = 500, - gp_nugget=1.e-10, - verbose=True): - - self.parameters = parameters - self.n_parameters = len(parameters) - self.acquisition_function = acquisition_function - self.n_iter = n_iter - self.n_init = n_init - self.n_candidates = n_candidates - self.param_names = parameters.keys() - self.param_isInt = np.array([ 0 if (parameters[k][0]=='float') else 1 for k in self.param_names ]) - self.param_bounds = np.zeros((self.n_parameters,2)) - self.gp_nugget = gp_nugget - self.verbose = verbose - self.scoring = scoring - self.estimator = estimator - self.fit_params = fit_params if fit_params is not None else {} - self.cv = cv - self.X = X - self.y = y - - if(callable(estimator)): - self._callable_estimator = True - if(verbose): - print('Estimator is a callable and not an sklearn Estimator') - else: - self._callable_estimator = False - - - if not self._callable_estimator: - self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) - - # init param_bounds - for i in range(self.n_parameters): - if(parameters[self.param_names[i]][0]=='cat'): - self.param_bounds[i,0] = 0 - self.param_bounds[i,1] = len(parameters[self.param_names[i]][1]) - else: - self.param_bounds[i] = np.array(parameters[self.param_names[i]][1]) - if(parameters[self.param_names[i]][0]=='int'): - self.param_bounds[i,1] += 1 - - if(self.verbose): - print(self.parameters) - print(self.param_names) - print(self.param_isInt) - print(self.param_bounds) - - - def vector_to_dict(self,vector_parameter): - dict_parameter = dict.fromkeys(self.param_names) - for i in range(self.n_parameters): - if(self.parameters[self.param_names[i]][0]=='cat'): - dict_parameter[self.param_names[i]] = (self.parameters[self.param_names[i]][1])[int(vector_parameter[i])] - elif(self.parameters[self.param_names[i]][0]=='int'): - dict_parameter[self.param_names[i]] = int(vector_parameter[i]) - else: - dict_parameter[self.param_names[i]] = vector_parameter[i] - - return dict_parameter - - def score(self,test_parameter): - if not self._callable_estimator: - cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) - cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, - train, test, False, test_parameter, - self.fit_params, return_parameters=True) - for train, test in cv ] - - n_test_samples = 0 - score = 0 - for tmp_score, tmp_n_test_samples, _, _ in cv_score: - tmp_score *= tmp_n_test_samples - n_test_samples += tmp_n_test_samples - score += tmp_score - score /= float(n_test_samples) - - else: - score = self.estimator(test_parameter) - - return score - - - def _fit(self): - - n_tested_parameters = 0 - tested_parameters = np.zeros((self.n_iter,self.n_parameters)) - cv_scores = np.zeros(self.n_iter) - - ### Initialize with random candidates ### - init_candidates = sample_candidates(self.n_init,self.param_bounds,self.param_isInt) - self.n_init = init_candidates.shape[0] - - for i in range(self.n_init): - dict_candidate = self.vector_to_dict(init_candidates[i,:]) - cv_score = self.score(dict_candidate) - - if(self.verbose): - print ('Step ' + str(i) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) - - is_in,idx = is_in_ndarray(init_candidates[i,:],tested_parameters[:n_tested_parameters,:]) - if not is_in: - tested_parameters[n_tested_parameters,:] = 
init_candidates[i,:] - cv_scores[n_tested_parameters] = cv_score - n_tested_parameters += 1 - else: - if(self.verbose): - print('Hyperparameter already tesed') - cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. - - - for i in range(self.n_iter-self.n_init): - - # Model with a Gaussian Process - gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters) , - thetaL = 0.001 * np.ones(self.n_parameters) , - thetaU = 10. * np.ones(self.n_parameters) , - random_start = 3, - nugget= self.gp_nugget) - gp.fit(tested_parameters[:n_tested_parameters,:],cv_scores[:n_tested_parameters]) - - # Sample candidates and predict their corresponding acquisition values - candidates = sample_candidates(self.n_candidates,self.param_bounds,self.param_isInt) - if(self.acquisition_function == 'UCB'): - predictions,MSE = gp.predict(candidates,eval_MSE=True) - upperBound = predictions + 1.96*np.sqrt(MSE) - best_candidate = candidates[np.argmax(upperBound)] - - elif(self.acquisition_function == 'EI'): - predictions,MSE = gp.predict(candidates,eval_MSE=True) - y_best = np.max(cv_scores) - ei = compute_ei(predictions,np.sqrt(MSE),y_best) - best_candidate = candidates[np.argmax(ei)] - - else: - print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) - - dict_candidate = self.vector_to_dict(best_candidate) - cv_score = self.score(dict_candidate) - if(self.verbose): - print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) - - is_in,idx = is_in_ndarray(best_candidate,tested_parameters[:n_tested_parameters,:]) - if not is_in: - tested_parameters[n_tested_parameters,:] = best_candidate - cv_scores[n_tested_parameters] = cv_score - n_tested_parameters += 1 - else: - if(self.verbose): - print('Hyperparameter already tesed') - cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. 
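
Aside, not part of the patch itself: compute_ei above evaluates the closed-form
expected improvement EI = sigma * (z * cdf(z) + pdf(z)) with
z = (y_best - mu) / sigma one candidate at a time; the same formula vectorizes
directly over NumPy arrays, e.g.:

    import numpy as np
    from scipy.stats import norm

    def compute_ei_vectorized(mu, sigma, y_best):
        # same closed form as compute_ei, applied to whole arrays at once
        z = (y_best - mu) / sigma
        return sigma * (z * norm.cdf(z) + norm.pdf(z))

    # e.g. compute_ei_vectorized(np.array([0.8, 0.9]), np.array([0.1, 0.2]), 0.95)
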
- - best_idx = np.argmax(cv_scores[:n_tested_parameters]) - vector_best_param = tested_parameters[best_idx] - best_parameter = self.vector_to_dict(vector_best_param) - - if(self.verbose): - print ('\nTested ' + str(n_tested_parameters) + ' parameters') - print ('Max cv score ' + str(cv_scores[best_idx])) - print ('Best parameter ' + str(tested_parameters[best_idx])) - print(best_parameter) - - return tested_parameters[:n_tested_parameters,:], cv_scores[:n_tested_parameters] \ No newline at end of file + def __init__(self, + parameters, + estimator, + scoring=None, + X=None, y=None, + fit_params=None, + refit=True, + cv=None, + acquisition_function='UCB', + n_iter=100, + n_init=10, + n_candidates=500, + gp_nugget=1.e-10, + verbose=True): + + self.parameters = parameters + self.n_parameters = len(parameters) + self.acquisition_function = acquisition_function + self.n_iter = n_iter + self.n_init = n_init + self.n_candidates = n_candidates + self.param_names = parameters.keys() + self.param_isInt = np.array([0 if (parameters[k][0] == 'float') + else 1 for k in self.param_names]) + self.param_bounds = np.zeros((self.n_parameters, 2)) + self.gp_nugget = gp_nugget + self.verbose = verbose + self.scoring = scoring + self.estimator = estimator + self.fit_params = fit_params if fit_params is not None else {} + self.cv = cv + self.X = X + self.y = y + + if(callable(estimator)): + self._callable_estimator = True + if(verbose): + print('Estimator is a callable and not an sklearn Estimator') + else: + self._callable_estimator = False + + if not self._callable_estimator: + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) + + # init param_bounds + for i in range(self.n_parameters): + if(parameters[self.param_names[i]][0] == 'cat'): + self.param_bounds[i, 0] = 0 + self.param_bounds[i, 1] = \ + len(parameters[self.param_names[i]][1]) + else: + self.param_bounds[i] = \ + np.array(parameters[self.param_names[i]][1]) + if(parameters[self.param_names[i]][0] == 'int'): + self.param_bounds[i, 1] += 1 + + if(self.verbose): + print(self.parameters) + print(self.param_names) + print(self.param_isInt) + print(self.param_bounds) + + def vector_to_dict(self, vector_parameter): + dict_parameter = dict.fromkeys(self.param_names) + for i in range(self.n_parameters): + if(self.parameters[self.param_names[i]][0] == 'cat'): + dict_parameter[self.param_names[i]] = \ + (self.parameters[self.param_names[i]][1])[ + int(vector_parameter[i])] + elif(self.parameters[self.param_names[i]][0] == 'int'): + dict_parameter[self.param_names[i]] = int(vector_parameter[i]) + else: + dict_parameter[self.param_names[i]] = vector_parameter[i] + + return dict_parameter + + def score(self, test_parameter): + if not self._callable_estimator: + cv = check_cv(self.cv, self.X, self.y, + classifier=is_classifier(self.estimator)) + cv_score = [_fit_and_score( + clone(self.estimator), self.X, self.y, self.scorer_, + train, test, False, test_parameter, + self.fit_params, return_parameters=True) + for train, test in cv] + + n_test_samples = 0 + score = 0 + for tmp_score, tmp_n_test_samples, _, _ in cv_score: + tmp_score *= tmp_n_test_samples + n_test_samples += tmp_n_test_samples + score += tmp_score + score /= float(n_test_samples) + + else: + score = self.estimator(test_parameter) + + return score + + def _fit(self): + + n_tested_parameters = 0 + tested_parameters = np.zeros((self.n_iter, self.n_parameters)) + cv_scores = np.zeros(self.n_iter) + + # Initialize with random candidates # + init_candidates = sample_candidates( + 
self.n_init, self.param_bounds, self.param_isInt) + self.n_init = init_candidates.shape[0] + + for i in range(self.n_init): + dict_candidate = self.vector_to_dict(init_candidates[i, :]) + cv_score = self.score(dict_candidate) + + if(self.verbose): + print ('Step ' + str(i) + ' - Hyperparameter ' + + str(dict_candidate) + ' ' + str(cv_score)) + + is_in, idx = is_in_ndarray( + init_candidates[i, :], + tested_parameters[:n_tested_parameters, :]) + if not is_in: + tested_parameters[n_tested_parameters, :] = \ + init_candidates[i, :] + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(self.verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. + + for i in range(self.n_iter-self.n_init): + + # Model with a Gaussian Process + gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters), + thetaL=0.001 * np.ones(self.n_parameters), + thetaU=10. * np.ones(self.n_parameters), + random_start=3, + nugget=self.gp_nugget) + gp.fit(tested_parameters[:n_tested_parameters, :], + cv_scores[:n_tested_parameters]) + + # Sample candidates and predict their corresponding + # acquisition values + candidates = sample_candidates(self.n_candidates, + self.param_bounds, + self.param_isInt) + if(self.acquisition_function == 'UCB'): + predictions, MSE = gp.predict(candidates, eval_MSE=True) + upperBound = predictions + 1.96*np.sqrt(MSE) + best_candidate = candidates[np.argmax(upperBound)] + + elif(self.acquisition_function == 'EI'): + predictions, MSE = gp.predict(candidates, eval_MSE=True) + y_best = np.max(cv_scores) + ei = compute_ei(predictions, np.sqrt(MSE), y_best) + best_candidate = candidates[np.argmax(ei)] + + else: + print('WARNING : acquisition_function not implemented yet : ' + + self.acquisition_function) + + dict_candidate = self.vector_to_dict(best_candidate) + cv_score = self.score(dict_candidate) + if(self.verbose): + print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + + str(dict_candidate) + ' ' + str(cv_score)) + + is_in, idx = is_in_ndarray( + best_candidate, + tested_parameters[:n_tested_parameters, :]) + if not is_in: + tested_parameters[n_tested_parameters, :] = best_candidate + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(self.verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. 
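+                # The update above is not an unweighted running mean: the
+                # newest evaluation keeps weight 1/2 and older ones decay
+                # geometrically, e.g. scores 0.6, 0.8, 0.7 observed in that
+                # order are stored as 0.25*0.6 + 0.25*0.8 + 0.5*0.7 = 0.7.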
+ + best_idx = np.argmax(cv_scores[:n_tested_parameters]) + vector_best_param = tested_parameters[best_idx] + best_parameter = self.vector_to_dict(vector_best_param) + + if(self.verbose): + print ('\nTested ' + str(n_tested_parameters) + ' parameters') + print ('Max cv score ' + str(cv_scores[best_idx])) + print ('Best parameter ' + str(tested_parameters[best_idx])) + print (best_parameter) + + return tested_parameters[:n_tested_parameters, :], \ + cv_scores[:n_tested_parameters] From c6b193cadcf930319a65df1de8b4f88bb0385e25 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Sat, 29 Aug 2015 22:27:37 +0200 Subject: [PATCH 8/9] FIX pep8 conformity --- examples/model_selection/gp_search.py | 427 +++++++++++++------------- 1 file changed, 216 insertions(+), 211 deletions(-) diff --git a/examples/model_selection/gp_search.py b/examples/model_selection/gp_search.py index 4b47e645536bb..5005d3a1b023d 100644 --- a/examples/model_selection/gp_search.py +++ b/examples/model_selection/gp_search.py @@ -12,230 +12,235 @@ import matplotlib.pyplot as plt import numpy as np -def extend_result(n_tests,tmp_res): - res = np.zeros(n_tests) - l = len(tmp_res) -1 - for i in range(n_tests): - res[i] = tmp_res[min(i,l)] - return res +def extend_result(n_tests, tmp_res): + res = np.zeros(n_tests) + l = len(tmp_res) - 1 + for i in range(n_tests): + res[i] = tmp_res[min(i, l)] + + return res def test1(): - iris = load_digits() - X, y = iris.data, iris.target - clf = RandomForestClassifier(n_estimators=20) + iris = load_digits() + X, y = iris.data, iris.target + clf = RandomForestClassifier(n_estimators=20) - # specify parameters and distributions to sample from - parameters = {"max_depth": ['int',[3, 3]], - "max_features": ['int',[1,11]], - "min_samples_split": ['int',[1,11]], - "min_samples_leaf": ['int',[1,11]], - "bootstrap": ['cat',[True, False]], - "criterion": ['cat',["gini", "entropy"]]} + # specify parameters and distributions to sample from + parameters = {"max_depth": ['int', [3, 3]], + "max_features": ['int', [1, 11]], + "min_samples_split": ['int', [1, 11]], + "min_samples_leaf": ['int', [1, 11]], + "bootstrap": ['cat', [True, False]], + "criterion": ['cat', ["gini", "entropy"]]} - search = GPSearchCV(parameters,estimator=clf,X=X,y=y,n_iter=20) - search._fit() + search = GPSearchCV(parameters, estimator=clf, X=X, y=y, n_iter=20) + search._fit() def test2(): - parameters = {'kernel' : ['cat',['rbf','poly']],'d' : ['int',[1,3]],'C' : ['float',[1,10]] } - def scoring_function(x): - return 0.5 + parameters = {'kernel': ['cat', ['rbf', 'poly']], + 'd': ['int', [1, 3]], + 'C': ['float', [1, 10]]} - search = GPSearchCV(parameters,estimator=scoring_function,n_iter=20) - search._fit() + def scoring_function(x): + return 0.5 + search = GPSearchCV(parameters, estimator=scoring_function, n_iter=20) + search._fit() -def test3(): - # Display progress logs on stdout - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - # Load some categories from the training set - categories = [ - 'alt.atheism', - 'talk.religion.misc', - ] - # Uncomment the following to do the analysis on all the categories - #categories = None - - print("Loading 20 newsgroups dataset for categories:") - print(categories) - - data = fetch_20newsgroups(subset='train', categories=categories) - print("%d documents" % len(data.filenames)) - print("%d categories" % len(data.target_names)) - print() - - # define a pipeline combining a text feature extractor with a simple - # classifier - pipeline = Pipeline([ - 
('vect', CountVectorizer()), - ('tfidf', TfidfTransformer()), - ('clf', SGDClassifier()), - ]) - - # uncommenting more parameters will give better exploring power but will - # increase processing time in a combinatorial way - parameters = { - 'vect__max_df': ['float',[0.5,1.]], - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': ['float',[0.000001, 0.00001]], - 'clf__penalty': ['cat',['l2', 'elasticnet']] - #'clf__n_iter': (10, 50, 80), - } - - search = GPSearchCV(parameters,estimator=pipeline,X=data.data, y=data.target,n_iter=20) - search._fit() - - -def gp_vs_random_search(test_name,n_tests,search_lenght,save_data=False): - """ - Compare GP-based search vs a simple random one - Choose test_name in {'iris','text'} - """ - - n_iter_search = search_lenght - - if(test_name == 'iris'): - iris = load_digits() - X, y = iris.data, iris.target - pipeline = RandomForestClassifier() - - # specify parameters and distributions to sample from - parameters = {"max_depth": ['int',[3, 3]], - "max_features": ['int',[1,11]], - "min_samples_split": ['int',[1,11]], - "min_samples_leaf": ['int',[1,11]], - "bootstrap": ['cat',[True, False]], - "criterion": ['cat',["gini", "entropy"]]} - - elif(test_name == 'text'): - # Display progress logs on stdout - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - # Load some categories from the training set - categories = [ - 'alt.atheism', - 'talk.religion.misc', - ] - # Uncomment the following to do the analysis on all the categories - #categories = None - print("Loading 20 newsgroups dataset for categories:") - print(categories) - - data = fetch_20newsgroups(subset='train', categories=categories) - print("%d documents" % len(data.filenames)) - print("%d categories" % len(data.target_names)) - - X = data.data - y = data.target - - # define a pipeline combining a text feature extractor with a simple - # classifier - pipeline = Pipeline([ - ('vect', CountVectorizer()), - ('tfidf', TfidfTransformer()), - ('clf', SGDClassifier()), - ]) - - # uncommenting more parameters will give better exploring power but will - # increase processing time in a combinatorial way - parameters = { - 'vect__max_df': ['float',[0.5,1.]], - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': ['float',[0.000001, 0.00001]], - 'clf__penalty': ['cat',['l2', 'elasticnet']] - #'clf__n_iter': (10, 50, 80), - } - - else: - print('Dataset not available for test') - - # GP UCB search - all_gp_ucb_results = [] - print 'GP_ucb search' - for i in range(n_tests): - ucb_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, - acquisition_function='UCB', - n_iter=n_iter_search, n_init=20, verbose=False) - _,scores = ucb_search._fit() - - max_scores = [scores[0]] - print 'Test',i,'-',len(scores),'parameters tested' - - for j in range(1,len(scores)): - max_scores.append(max(max_scores[j-1],scores[j])) - all_gp_ucb_results.append(extend_result(n_iter_search,max_scores)) - all_gp_ucb_results = np.asarray(all_gp_ucb_results) - print all_gp_ucb_results.shape - if(save_data): - np.savetxt('gp_ucb_scores.csv',all_gp_ucb_results,delimiter=',') - - # # GP EI search - # all_gp_ei_results = [] - # print 'GP_ei search' - # for i in range(n_tests): - # ei_search = 
GPSearchCV(parameters,estimator=pipeline,X=X,y=y,
-    #                   acquisition_function='EI',
-    #                   n_iter=n_iter_search, n_init=20, verbose=False)
-    #   _,scores = ei_search._fit()
-
-    #   max_scores = [scores[0]]
-    #   print 'Test',i,'-',len(scores),'parameters tested'
-
-    #   for j in range(1,len(scores)):
-    #       max_scores.append(max(max_scores[j-1],scores[j]))
-    #   all_gp_ei_results.append(extend_result(n_iter_search,max_scores))
-    # all_gp_ei_results = np.asarray(all_gp_ei_results)
-    # print all_gp_ei_results.shape
-    # if(save_data):
-    #   np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',')
-
-    # Randomized search
-    print 'Random search'
-    all_random_results = []
-    for i in range(n_tests):
-        random_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y,
-                    n_iter=n_iter_search, n_init=n_iter_search, verbose=False)
-        _,scores = random_search._fit()
-
-        max_scores = [scores[0]]
-        print 'Test',i,'-',len(scores),'parameters tested'
-
-        for j in range(1,len(scores)):
-            max_scores.append(max(max_scores[j-1],scores[j]))
-        all_random_results.append(extend_result(n_iter_search,max_scores))
-    all_random_results = np.asarray(all_random_results)
-    if(save_data):
-        np.savetxt('rand_scores.csv',all_random_results,delimiter=',')
-
-    plt.figure()
-    # plt.plot(range(n_iter_search),np.mean(all_gp_ei_results,axis=0),'r',label='GP-EI')
-    plt.plot(range(n_iter_search),np.mean(all_gp_ucb_results,axis=0),'b',label='GP-UCB')
-    plt.plot(range(n_iter_search),np.mean(all_random_results,axis=0),'g',label='Random')
-    plt.legend(loc=4)
-    plt.title('Test GP vs Random on ' + test_name +' dataset - Average on ' + str(n_tests) + ' trials')
-    plt.xlabel('Iterations')
-    plt.ylabel('Max CV performance')
-    plt.show()
+def test3():
+    # Display progress logs on stdout
+    logging.basicConfig(level=logging.INFO,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    # Load some categories from the training set
+    categories = [
+        'alt.atheism',
+        'talk.religion.misc',
+    ]
+    # Uncomment the following to do the analysis on all the categories
+    # categories = None
+
+    print("Loading 20 newsgroups dataset for categories:")
+    print(categories)
+
+    data = fetch_20newsgroups(subset='train', categories=categories)
+    print("%d documents" % len(data.filenames))
+    print("%d categories" % len(data.target_names))
+    print()
+
+    # define a pipeline combining a text feature extractor with a simple
+    # classifier
+    pipeline = Pipeline([
+        ('vect', CountVectorizer()),
+        ('tfidf', TfidfTransformer()),
+        ('clf', SGDClassifier()),
+    ])
+
+    # uncommenting more parameters will give better exploring power but will
+    # increase processing time in a combinatorial way
+    parameters = {
+        'vect__max_df': ['float', [0.5, 1.]],
+        # 'vect__max_features': (None, 5000, 10000, 50000),
+        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
+        # 'tfidf__use_idf': (True, False),
+        # 'tfidf__norm': ('l1', 'l2'),
+        'clf__alpha': ['float', [0.000001, 0.00001]],
+        'clf__penalty': ['cat', ['l2', 'elasticnet']]
+        # 'clf__n_iter': (10, 50, 80),
+    }
+
+    search = GPSearchCV(parameters, estimator=pipeline,
+                        X=data.data, y=data.target, n_iter=20)
+    search._fit()
+
+
+def gp_vs_random_search(test_name, n_tests, search_length, save_data=False):
+    """
+    Compare GP-based search vs a simple random one.
+    Choose test_name in {'iris', 'text'}
+    """
+
+    n_iter_search = search_length
+
+    if(test_name == 'iris'):
+        iris = load_digits()
+        X, y = iris.data, iris.target
+        pipeline = RandomForestClassifier()
+
+        # specify parameters and distributions to sample from
+        parameters = 
{"max_depth": ['int', [3, 3]],
+                      "max_features": ['int', [1, 11]],
+                      "min_samples_split": ['int', [1, 11]],
+                      "min_samples_leaf": ['int', [1, 11]],
+                      "bootstrap": ['cat', [True, False]],
+                      "criterion": ['cat', ["gini", "entropy"]]}
+
+    elif(test_name == 'text'):
+        # Display progress logs on stdout
+        logging.basicConfig(level=logging.INFO,
+                            format='%(asctime)s %(levelname)s %(message)s')
+
+        # Load some categories from the training set
+        categories = [
+            'alt.atheism',
+            'talk.religion.misc',
+        ]
+        # Uncomment the following to do the analysis on all the categories
+        # categories = None
+        print("Loading 20 newsgroups dataset for categories:")
+        print(categories)
+
+        data = fetch_20newsgroups(subset='train', categories=categories)
+        print("%d documents" % len(data.filenames))
+        print("%d categories" % len(data.target_names))
+
+        X = data.data
+        y = data.target
+
+        # define a pipeline combining a text feature extractor with a simple
+        # classifier
+        pipeline = Pipeline([
+            ('vect', CountVectorizer()),
+            ('tfidf', TfidfTransformer()),
+            ('clf', SGDClassifier()),
+        ])
+
+        # uncommenting more parameters will give better exploring power but
+        # will increase processing time in a combinatorial way
+        parameters = {
+            'vect__max_df': ['float', [0.5, 1.]],
+            # 'vect__max_features': (None, 5000, 10000, 50000),
+            # unigrams or bigrams
+            'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],
+            # 'tfidf__use_idf': (True, False),
+            # 'tfidf__norm': ('l1', 'l2'),
+            'clf__alpha': ['float', [0.000001, 0.00001]],
+            'clf__penalty': ['cat', ['l2', 'elasticnet']]
+            # 'clf__n_iter': (10, 50, 80),
+        }
+
+    else:
+        print('Dataset not available for test')
+
+    # GP UCB search
+    all_gp_ucb_results = []
+    print('GP_ucb search')
+    for i in range(n_tests):
+        ucb_search = GPSearchCV(parameters, estimator=pipeline, X=X, y=y,
+                                acquisition_function='UCB',
+                                n_iter=n_iter_search, n_init=20,
+                                verbose=False)
+        _, scores = ucb_search._fit()
+
+        max_scores = [scores[0]]
+        print('Test %d - %d parameters tested' % (i, len(scores)))
+
+        for j in range(1, len(scores)):
+            max_scores.append(max(max_scores[j-1], scores[j]))
+        all_gp_ucb_results.append(extend_result(n_iter_search, max_scores))
+    all_gp_ucb_results = np.asarray(all_gp_ucb_results)
+    print(all_gp_ucb_results.shape)
+    if(save_data):
+        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')
+
+    # # GP EI search
+    # all_gp_ei_results = []
+    # print 'GP_ei search'
+    # for i in range(n_tests):
+    #     ei_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y,
+    #                            acquisition_function='EI',
+    #                            n_iter=n_iter_search, n_init=20, verbose=False)
+    #     _,scores = ei_search._fit()
+
+    #     max_scores = [scores[0]]
+    #     print 'Test',i,'-',len(scores),'parameters tested'
+
+    #     for j in range(1,len(scores)):
+    #         max_scores.append(max(max_scores[j-1],scores[j]))
+    #     all_gp_ei_results.append(extend_result(n_iter_search,max_scores))
+    # all_gp_ei_results = np.asarray(all_gp_ei_results)
+    # print all_gp_ei_results.shape
+    # if(save_data):
+    #     np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',')
+
+    # Randomized search
+    print('Random search')
+    all_random_results = []
+    for i in range(n_tests):
+        random_search = GPSearchCV(parameters, estimator=pipeline, X=X, y=y,
+                                   n_iter=n_iter_search,
+                                   n_init=n_iter_search, verbose=False)
+        _, scores = random_search._fit()
+
+        max_scores = [scores[0]]
+        print('Test %d - %d parameters tested' % (i, len(scores)))
+
+        for j in range(1, len(scores)):
+            max_scores.append(max(max_scores[j-1], scores[j]))
+        all_random_results.append(extend_result(n_iter_search, 
max_scores))
+    all_random_results = np.asarray(all_random_results)
+    if(save_data):
+        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')
+
+    plt.figure()
+    # plt.plot(range(n_iter_search), np.mean(all_gp_ei_results, axis=0),
+    #          'r', label='GP-EI')
+    plt.plot(range(n_iter_search), np.mean(all_gp_ucb_results, axis=0),
+             'b', label='GP-UCB')
+    plt.plot(range(n_iter_search), np.mean(all_random_results, axis=0),
+             'g', label='Random')
+    plt.legend(loc=4)
+    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on '
+              + str(n_tests) + ' trials')
+    plt.xlabel('Iterations')
+    plt.ylabel('Max CV performance')
+    plt.show()
 
 
 if __name__ == "__main__":
-
-    # print 'Routine Test'
-    # test2()
-    test_name = 'text'
-    n_tests = 20
-    search_lenght = 60
-    print '\nTest GP vs Random on',test_name,'dataset - Average on',n_tests,'trials'
-    gp_vs_random_search(test_name,n_tests,search_lenght,save_data=True)
\ No newline at end of file
+
+    print('Routine Test')
+    test2()
+    test1()
+
+    # test_name = 'text'
+    # n_tests = 20
+    # search_length = 60
+    # print('\nTest GP vs Random on %s dataset - Average on %d trials'
+    #       % (test_name, n_tests))
+    # gp_vs_random_search(test_name, n_tests, search_length, save_data=True)

From 7aeb5f7bd67c53cc8b0f3bc3d04e12aa69f043e9 Mon Sep 17 00:00:00 2001
From: Sebastien 
Date: Sat, 29 Aug 2015 22:27:59 +0200
Subject: [PATCH 9/9] Added documentation

---
 sklearn/gp_search.py | 140 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 135 insertions(+), 5 deletions(-)

diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py
index d0529d779a07f..cb667d992d088 100644
--- a/sklearn/gp_search.py
+++ b/sklearn/gp_search.py
@@ -19,7 +19,6 @@
 # UTILS #
 
-
 def sample_candidates(n_candidates, param_bounds, param_isInt):
 
     n_parameters = param_isInt.shape[0]
@@ -50,6 +49,7 @@ def compute_ei(predictions, sigma, y_best):
     for i in range(ei_array.shape[0]):
         z = (y_best - predictions[i]) / sigma[i]
         ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z))
+
     return ei_array
 
 
@@ -88,11 +88,110 @@ def is_in_ndarray(item, a):
 # GPSearchCV #
 
 class GPSearchCV(object):
     """
+    Parameters
+    ----------
+
+    parameters : dict, parameter space on which to optimize the estimator
+        The keys of the dictionary should be the names of the parameters,
+        and the values should be lists of length 2; the first element being
+        the type of the parameter ('int', 'float' or 'cat' [for categorical]),
+        and the second element being a list of either the bounds between which
+        to search (for 'int' and 'float') or the values the parameter can take
+        (for 'cat').
+        Example : parameters = {'kernel' : ['cat', ['rbf', 'poly']],
+                                'd' : ['int', [1, 3]],
+                                'C' : ['float', [1, 10]]}
+
+    estimator : 1) sklearn estimator or 2) callable
+        1 : object type that implements the "fit" and "predict" methods,
+        as a classifier or a pipeline
+        2 : a function that computes the output given a dictionary of
+        parameters. The returned value should be a list of one or more
+        floats if score_format == 'cv', and a float if score_format ==
+        'avg'
+
+    X : array-like, shape = [n_samples, n_features]
+        Training vector, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    fit_params : dict, optional
+        Parameters to pass to the fit method.
+
+    scoring : string, callable or None, optional
+        A string (see sklearn's model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+        Default is None.
+
+    cv : integer or cross-validation generator, optional
+        Relevant if the estimator is an sklearn object.
+        If an integer is passed, it is the number of folds.
+        Specific cross-validation objects can be passed, see the
+        sklearn.cross_validation module for the list of possible objects.
+        Default is 5.
+
+    acquisition_function : string, optional
+        Function to maximize in order to choose the next parameter to test.
+        - Simple : maximizes the predicted output
+        - UCB : maximizes the upper confidence bound
+        - EI : maximizes the expected improvement
+        Default is 'UCB'.
+
+    n_iter : int, optional
+        Total number of iterations to perform (including n_init and
+        n_final_iter).
+        Default is 100.
+
+    n_init : int, optional
+        Number of random iterations to perform before the smart search.
+        Default is 30.
+
+    n_final_iter : int, optional
+        Number of final iterations, i.e. smart iterations but with
+        acquisition_function == 'Simple'.
+        Default is 5.
+
+    n_candidates : int, optional
+        Number of random candidates to sample for each GP iteration.
+        Default is 500.
+
+    nugget : float, optional
+        The nugget to set for the Gaussian Process.
+        Default is 1.e-10.
+
+
+    Attributes
+    ----------
+
+    best_parameter_ : dict, the parameter set, among those tested by the
+        method _fit, that maximizes the mean of the cross-validation results.
+
+    tested_parameters_ : ndarray, the parameters tested by _fit.
+
+
     Examples
     --------
-    >>> parameters = {'kernel' : ['cat', ['rbf','poly']],
-    ...               'd' : ['int', [1,3]],
-    ...               'C' : ['float',[1,10])}
+    >>> from sklearn.datasets import load_digits
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> iris = load_digits()
+    >>> X, y = iris.data, iris.target
+    >>> clf = RandomForestClassifier(n_estimators=20)
+    >>> parameters = {"max_depth": ['int', [3, 3]],
+    ...               "max_features": ['int', [1, 11]],
+    ...               "min_samples_split": ['int', [1, 11]],
+    ...               "min_samples_leaf": ['int', [1, 11]],
+    ...               "bootstrap": ['cat', [True, False]],
+    ...               "criterion": ['cat', ["gini", "entropy"]]}
+
+    >>> search = GPSearchCV(parameters,
+    ...                     estimator=clf,
+    ...                     X=X,
+    ...                     y=y,
+    ...                     n_iter=20)
+    >>> search._fit()
 
     """
 
@@ -130,6 +229,10 @@ def __init__(self,
         self.X = X
         self.y = y
 
+        self.best_parameter_ = None
+        self.tested_parameters_ = None
+        self.cv_scores_ = None
+
         if(callable(estimator)):
             self._callable_estimator = True
             if(verbose):
@@ -173,6 +276,19 @@ def vector_to_dict(self, vector_parameter):
         return dict_parameter
 
     def score(self, test_parameter):
+        """
+        The score function to call in order to evaluate the quality
+        of the parameter set test_parameter.
+
+        Parameters
+        ----------
+        test_parameter : dict, the parameter set to test
+
+        Returns
+        -------
+        score : float, the mean of the CV scores
+        """
+
         if not self._callable_estimator:
             cv = check_cv(self.cv, self.X, self.y,
                           classifier=is_classifier(self.estimator))
@@ -196,6 +312,15 @@ def score(self, test_parameter):
         return score
 
     def _fit(self):
+        """
+        Run the hyper-parameter optimization process.
+
+        Returns
+        -------
+        tested_parameters_ : ndarray, the parameters tested during the process
+
+        cv_scores_ : array of the mean CV results of the parameters tested
+        """
 
         n_tested_parameters = 0
         tested_parameters = np.zeros((self.n_iter, self.n_parameters))
@@ -280,6 +405,11 @@ def _fit(self):
         vector_best_param = tested_parameters[best_idx]
         best_parameter = self.vector_to_dict(vector_best_param)
 
+        # store the search results as public attributes
+        self.best_parameter_ = best_parameter
+        
self.tested_parameters_ = tested_parameters[:n_tested_parameters, :] + self.cv_scores_ = cv_scores[:n_tested_parameters] + if(self.verbose): print ('\nTested ' + str(n_tested_parameters) + ' parameters') print ('Max cv score ' + str(cv_scores[best_idx]))
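
Note on the acquisition step: the docstring above names three acquisition
functions ('Simple', 'UCB' and 'EI'), but only the EI helper (compute_ei) is
visible in the hunks shown here. The sketch below is illustrative only, not
the patch's implementation: it shows how the three rules would rank a pool of
sampled candidates. It assumes the legacy sklearn GaussianProcess API, whose
predict(X, eval_MSE=True) returns the predictive mean and variance, and a
hypothetical exploration weight `kappa` for the UCB rule; the EI branch
mirrors the compute_ei helper above.

    import numpy as np
    from scipy.stats import norm


    def acquisition(gp, candidates, y_best, kind='UCB', kappa=1.96):
        # predictive mean and variance for each candidate row;
        # eval_MSE=True is the legacy GaussianProcess way to get the variance
        mu, mse = gp.predict(candidates, eval_MSE=True)
        sigma = np.sqrt(mse)

        if kind == 'Simple':
            # greedy: rank candidates by predicted CV score only
            return mu
        elif kind == 'UCB':
            # optimism in the face of uncertainty: bonus for uncertain regions
            return mu + kappa * sigma
        elif kind == 'EI':
            # expected improvement, mirroring the compute_ei helper above
            z = (y_best - mu) / sigma
            return sigma * (z * norm.cdf(z) + norm.pdf(z))
        raise ValueError('Unknown acquisition function: %s' % kind)

    # the next parameter vector to evaluate would then be, e.g.:
    # best = candidates[np.argmax(acquisition(gp, candidates, y_best))]

Sampling a finite pool with sample_candidates and scoring it this way
side-steps a continuous optimization of the acquisition function, at the cost
of only ever considering n_candidates random points per iteration.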