From 21ef8ba96de935afdcdd7600aa279f78aa1c7414 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 00:24:46 +0200 Subject: [PATCH 1/9] Initial gp-based hyperopt code --- sklearn/gp_search.py | 262 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 sklearn/gp_search.py diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py new file mode 100644 index 0000000000000..8498890bdd6cc --- /dev/null +++ b/sklearn/gp_search.py @@ -0,0 +1,262 @@ +""" +The :mod:`sklearn.gp_search` includes utilities to fine-tune the parameters +of an estimator through a Gaussian Process model. +""" +from __future__ import print_function + +# Author: Sebastien Dubois , +# License: BSD 3 clause + +import numpy as np + +from .gaussian_process.gaussian_process import GaussianProcess +from .cross_validation import check_cv +from .cross_validation import _fit_and_score +from .metrics.scorer import check_scoring +from .ensemble import RandomForestClassifier +from .base import is_classifier, clone + +##################### UTILS ##################### + +def sample_candidates(n_candidates,param_bounds,param_isInt): + n_parameters = param_isInt.shape[0] + candidates = [] + + for k in range(n_parameters): + if(param_isInt[k]): + k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] , + dtype = np.int32) + else: + k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] ) + candidates.append(k_sample) + + candidates = np.asarray(candidates) + candidates = candidates.T + + return compute_unique(candidates) + +def compute_unique(a): + # keep only unique values in the ndarray a + # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array + + b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1]))) + _, idx = np.unique(b, return_index=True) + idx =np.sort(idx) + + return a[idx] + +def is_in_ndarray(item,a): + # look for element item in ndarray a + # returns True if item is in a, and its index + + k = 0 + idx_val = np.asarray(range(a.shape[0])) + idxk = range(a.shape[0]) + while( k < a.shape[1]): + idxk = (a[idxk,k]==item[k]) + if(np.sum(idxk > 0)): + k += 1 + idx_val = idx_val[idxk] + idxk = list(idx_val) + else: + return False,0 + + return True,idx_val[0] + + + +##################### GPSearchCV ##################### + +class GPSearchCV(object): + """ + Examples + -------- + >>> parameters = {'kernel' : ['rbf','poly'], + ... 'd' : [1,3], + ... 'C' : [1,10] } + >>> parameters_details = {'kernel' : 'cat', + ... 'd' : 'int', + ... 
'C' : 'float'} + + """ + + def __init__(self, + parameters, + parameters_details, + estimator, scoring=None, + X=None,y=None, + fit_params=None, + refit=True, + cv=None, + acquisition_function = 'UCB', + n_iter=100, + n_init=10, + n_candidates = 500, + gp_nugget=1.e-10, + verbose=True): + + self.parameters = parameters + self.n_parameters = len(parameters) + self.parameters_details = parameters_details + self.acquisition_function = acquisition_function + self.n_iter = n_iter + self.n_init = n_init + self.n_candidates = n_candidates + self.param_names = parameters.keys() + self.param_isInt = np.array([ 0 if (parameters_details[k]=='float') else 1 for k in self.param_names ]) + self.param_bounds = np.zeros((self.n_parameters,2)) + self.gp_nugget = gp_nugget + self.verbose = verbose + self.scoring = scoring + self.estimator = estimator + self.fit_params = fit_params if fit_params is not None else {} + self.cv = cv + self.X = X + self.y = y + + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) + + # init param_bounds + for i in range(self.n_parameters): + if(parameters_details[self.param_names[i]]=='cat'): + self.param_bounds[i,0] = 0 + self.param_bounds[i,1] = len(parameters[self.param_names[i]]) + else: + self.param_bounds[i] = np.array(parameters[self.param_names[i]]) + if(parameters_details[self.param_names[i]]=='int'): + self.param_bounds[i,1] += 1 + + if(self.verbose): + print(self.parameters) + print(self.parameters_details) + print(self.param_names) + print(self.param_isInt) + print(self.param_bounds) + + + def vector_to_dict(self,vector_parameter): + dict_parameter = dict.fromkeys(self.param_names) + for i in range(self.n_parameters): + if(self.parameters_details[self.param_names[i]]=='cat'): + dict_parameter[self.param_names[i]] = self.parameters[self.param_names[i]][int(vector_parameter[i])] + elif(self.parameters_details[self.param_names[i]]=='int'): + dict_parameter[self.param_names[i]] = int(vector_parameter[i]) + else: + dict_parameter[self.param_names[i]] = vector_parameter[i] + + return dict_parameter + + def score(self,test_parameter): + cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) + cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, + train, test, False, test_parameter, + self.fit_params, return_parameters=True) + for train, test in cv ] + + n_test_samples = 0 + score = 0 + for tmp_score, tmp_n_test_samples, _, _ in cv_score: + tmp_score *= tmp_n_test_samples + n_test_samples += tmp_n_test_samples + score += tmp_score + score /= float(n_test_samples) + + return score + + + def fit(self): + + n_tested_parameters = 0 + tested_parameters = np.zeros((self.n_iter,self.n_parameters)) + cv_scores = np.zeros(self.n_iter) + + ### Initialize with random candidates ### + init_candidates = sample_candidates(self.n_init,self.param_bounds,self.param_isInt) + + for i in range(self.n_init): + cand = self.vector_to_dict(init_candidates[i,:]) + cv_score = self.score(cand) + + if(self.verbose): + print ('Step ' + str(i) + ' - Hyperparameter ' + str(init_candidates[i,:]) + ' ' + str(cv_score)) + + is_in,idx = is_in_ndarray(init_candidates[i,:],tested_parameters[:n_tested_parameters,:]) + if not is_in: + tested_parameters[n_tested_parameters,:] = init_candidates[i,:] + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. 
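+                # Note: sample_candidates() already deduplicates within a
+                # batch via compute_unique(), so the is_in_ndarray() check
+                # above only guards against re-drawing a point evaluated at
+                # an earlier step; its stored cv score is then averaged with
+                # the new evaluation rather than occupying a second slot.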
+ + + for i in range(self.n_iter-self.n_init): + + # Model with a Gaussian Process + gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters) , + thetaL = 0.001 * np.ones(self.n_parameters) , + thetaU = 10. * np.ones(self.n_parameters) , + nugget= self.gp_nugget) + gp.fit(tested_parameters[:n_tested_parameters,:],cv_scores[:n_tested_parameters]) + + # Sample candidates and predict their corresponding acquisition values + candidates = sample_candidates(self.n_candidates,self.param_bounds,self.param_isInt) + if(self.acquisition_function == 'UCB'): + predictions,MSE = gp.predict(candidates,eval_MSE=True) + upperBound = predictions + 1.96*np.sqrt(MSE) + best_candidate = candidates[np.argmax(upperBound)] + + else: + print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) + + cv_score = self.score(self.vector_to_dict(best_candidate)) + if(self.verbose): + print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(best_candidate) + ' ' + str(cv_score)) + + is_in,idx = is_in_ndarray(best_candidate,tested_parameters[:n_tested_parameters,:]) + if not is_in: + tested_parameters[n_tested_parameters,:] = best_candidate + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. + + best_idx = np.argmax(cv_scores[:n_tested_parameters]) + vector_best_param = tested_parameters[best_idx] + best_parameter = self.vector_to_dict(vector_best_param) + + if(self.verbose): + print ('\nTested ' + str(n_tested_parameters) + ' parameters') + print ('Max cv score ' + str(cv_scores[best_idx])) + print ('Best parameter ' + str(tested_parameters[best_idx])) + print(best_parameter) + + +def test(): + from sklearn.datasets import load_digits + iris = load_digits() + X, y = iris.data, iris.target + clf = RandomForestClassifier(n_estimators=20) + + # specify parameters and distributions to sample from + parameters = {"max_depth": [3, 3], + "max_features": [1,11], + "min_samples_split": [1,11], + "min_samples_leaf": [1,11], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + + parameters_details = {"max_depth": 'int', + "max_features": 'int', + "min_samples_split": 'int', + "min_samples_leaf": 'int', + "bootstrap": 'cat', + "criterion": 'cat'} + + search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) + search.fit() + +test() \ No newline at end of file From 37ebc2621206cde522a99d36c75939703bf33446 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 15:11:01 +0200 Subject: [PATCH 2/9] GPSearchCV handles both sklearn Estimators and custom score functions --- sklearn/gp_search.py | 86 ++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 8498890bdd6cc..3f5d54c2438a4 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -13,7 +13,6 @@ from .cross_validation import check_cv from .cross_validation import _fit_and_score from .metrics.scorer import check_scoring -from .ensemble import RandomForestClassifier from .base import is_classifier, clone ##################### UTILS ##################### @@ -83,7 +82,8 @@ class GPSearchCV(object): def __init__(self, parameters, parameters_details, - estimator, scoring=None, + estimator, + scoring=None, X=None,y=None, fit_params=None, refit=True, @@ -114,7 +114,16 @@ def __init__(self, self.X = X self.y = y - self.scorer_ = check_scoring(self.estimator, 
scoring=self.scoring) + if(callable(estimator)): + self._callable_estimator = True + if(verbose): + print('Estimator is a callable and not an sklearn Estimator') + else: + self._callable_estimator = False + + + if not self._callable_estimator: + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) # init param_bounds for i in range(self.n_parameters): @@ -147,24 +156,28 @@ def vector_to_dict(self,vector_parameter): return dict_parameter def score(self,test_parameter): - cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) - cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, - train, test, False, test_parameter, - self.fit_params, return_parameters=True) - for train, test in cv ] - - n_test_samples = 0 - score = 0 - for tmp_score, tmp_n_test_samples, _, _ in cv_score: - tmp_score *= tmp_n_test_samples - n_test_samples += tmp_n_test_samples - score += tmp_score - score /= float(n_test_samples) + if not self._callable_estimator: + cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) + cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, + train, test, False, test_parameter, + self.fit_params, return_parameters=True) + for train, test in cv ] + + n_test_samples = 0 + score = 0 + for tmp_score, tmp_n_test_samples, _, _ in cv_score: + tmp_score *= tmp_n_test_samples + n_test_samples += tmp_n_test_samples + score += tmp_score + score /= float(n_test_samples) + + else: + score = self.estimator(test_parameter) return score - def fit(self): + def _fit(self): n_tested_parameters = 0 tested_parameters = np.zeros((self.n_iter,self.n_parameters)) @@ -172,13 +185,14 @@ def fit(self): ### Initialize with random candidates ### init_candidates = sample_candidates(self.n_init,self.param_bounds,self.param_isInt) + self.n_init = init_candidates.shape[0] for i in range(self.n_init): - cand = self.vector_to_dict(init_candidates[i,:]) - cv_score = self.score(cand) + dict_candidate = self.vector_to_dict(init_candidates[i,:]) + cv_score = self.score(dict_candidate) if(self.verbose): - print ('Step ' + str(i) + ' - Hyperparameter ' + str(init_candidates[i,:]) + ' ' + str(cv_score)) + print ('Step ' + str(i) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) is_in,idx = is_in_ndarray(init_candidates[i,:],tested_parameters[:n_tested_parameters,:]) if not is_in: @@ -210,9 +224,10 @@ def fit(self): else: print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) - cv_score = self.score(self.vector_to_dict(best_candidate)) + dict_candidate = self.vector_to_dict(best_candidate) + cv_score = self.score(dict_candidate) if(self.verbose): - print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(best_candidate) + ' ' + str(cv_score)) + print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) is_in,idx = is_in_ndarray(best_candidate,tested_parameters[:n_tested_parameters,:]) if not is_in: @@ -234,29 +249,4 @@ def fit(self): print ('Best parameter ' + str(tested_parameters[best_idx])) print(best_parameter) - -def test(): - from sklearn.datasets import load_digits - iris = load_digits() - X, y = iris.data, iris.target - clf = RandomForestClassifier(n_estimators=20) - - # specify parameters and distributions to sample from - parameters = {"max_depth": [3, 3], - "max_features": [1,11], - "min_samples_split": [1,11], - "min_samples_leaf": [1,11], - "bootstrap": [True, False], - "criterion": ["gini", 
"entropy"]} - - parameters_details = {"max_depth": 'int', - "max_features": 'int', - "min_samples_split": 'int', - "min_samples_leaf": 'int', - "bootstrap": 'cat', - "criterion": 'cat'} - - search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) - search.fit() - -test() \ No newline at end of file + return tested_parameters[:n_tested_parameters,:], cv_scores[:n_tested_parameters] \ No newline at end of file From e7677b6690ffda4a20588cb3a8ce15a238a4e0f7 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 15:15:08 +0200 Subject: [PATCH 3/9] FIX typo --- sklearn/gp_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 3f5d54c2438a4..44e1c201e2303 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -200,7 +200,7 @@ def _fit(self): cv_scores[n_tested_parameters] = cv_score n_tested_parameters += 1 else: - if(verbose): + if(self.verbose): print('Hyperparameter already tesed') cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. @@ -235,7 +235,7 @@ def _fit(self): cv_scores[n_tested_parameters] = cv_score n_tested_parameters += 1 else: - if(verbose): + if(self.verbose): print('Hyperparameter already tesed') cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. From 2a043a44618e1d898d084c99bc106fc12c6203b3 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Thu, 30 Jul 2015 19:02:52 +0200 Subject: [PATCH 4/9] Adding example for GPSearchCV --- examples/model_selection/gp_search.py | 252 ++++++++++++++++++++++++++ sklearn/gp_search.py | 1 + 2 files changed, 253 insertions(+) create mode 100644 examples/model_selection/gp_search.py diff --git a/examples/model_selection/gp_search.py b/examples/model_selection/gp_search.py new file mode 100644 index 0000000000000..72ff41c280df7 --- /dev/null +++ b/examples/model_selection/gp_search.py @@ -0,0 +1,252 @@ +from sklearn.datasets import load_digits +from sklearn.gp_search import GPSearchCV +from sklearn.ensemble import RandomForestClassifier +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.linear_model import SGDClassifier +from sklearn.grid_search import GridSearchCV, RandomizedSearchCV +from sklearn.pipeline import Pipeline + +import logging +import matplotlib.pyplot as plt +import numpy as np + +def extend_result(n_tests,tmp_res): + res = np.zeros(n_tests) + l = len(tmp_res) -1 + for i in range(n_tests): + res[i] = tmp_res[min(i,l)] + + return res + + +def test1(): + iris = load_digits() + X, y = iris.data, iris.target + clf = RandomForestClassifier(n_estimators=20) + + # specify parameters and distributions to sample from + parameters = {"max_depth": [3, 3], + "max_features": [1,11], + "min_samples_split": [1,11], + "min_samples_leaf": [1,11], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + + parameters_details = {"max_depth": 'int', + "max_features": 'int', + "min_samples_split": 'int', + "min_samples_leaf": 'int', + "bootstrap": 'cat', + "criterion": 'cat'} + + search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) + search._fit() + + +def test2(): + parameters = {'kernel' : ['rbf','poly'],'d' : [1,3],'C' : [1,10] } + parameters_details = {'kernel' : 'cat','d' : 'int','C' : 'float'} + def scoring_function(x): + return 0.5 + + search = GPSearchCV(parameters,parameters_details,estimator=scoring_function,n_iter=20) + search._fit() + + +def test3(): + # Display 
progress logs on stdout + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s %(message)s') + + # Load some categories from the training set + categories = [ + 'alt.atheism', + 'talk.religion.misc', + ] + # Uncomment the following to do the analysis on all the categories + #categories = None + + print("Loading 20 newsgroups dataset for categories:") + print(categories) + + data = fetch_20newsgroups(subset='train', categories=categories) + print("%d documents" % len(data.filenames)) + print("%d categories" % len(data.target_names)) + print() + + # define a pipeline combining a text feature extractor with a simple + # classifier + pipeline = Pipeline([ + ('vect', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('clf', SGDClassifier()), + ]) + + # uncommenting more parameters will give better exploring power but will + # increase processing time in a combinatorial way + parameters = { + 'vect__max_df': [0.5,1.], + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': [(1, 1), (1, 2)], # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': [0.000001, 0.00001], + 'clf__penalty': ['l2', 'elasticnet'], + #'clf__n_iter': (10, 50, 80), + } + + parameters_details = { + 'vect__max_df': 'float', + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': 'cat' , # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': 'float', + 'clf__penalty': 'cat', + #'clf__n_iter': (10, 50, 80), + } + + search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=data.data, y=data.target,n_iter=20) + search._fit() + + +def gp_vs_random_search(test_name,n_tests,search_lenght): + """ + Compare GP-based search vs a simple random one + Choose test_name in {'iris','text'} + """ + + n_iter_search = search_lenght + + if(test_name == 'iris'): + iris = load_digits() + X, y = iris.data, iris.target + pipeline = RandomForestClassifier(n_estimators=20) + + # specify parameters and distributions to sample from + parameters = {"max_depth": [2, 15], + "max_features": [1,20], + "min_samples_split": [1,20], + "min_samples_leaf": [1,20], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"]} + + parameters_details = {"max_depth": 'int', + "max_features": 'int', + "min_samples_split": 'int', + "min_samples_leaf": 'int', + "bootstrap": 'cat', + "criterion": 'cat'} + + elif(test_name == 'text'): + # Display progress logs on stdout + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s %(message)s') + + # Load some categories from the training set + categories = [ + 'alt.atheism', + 'talk.religion.misc', + ] + # Uncomment the following to do the analysis on all the categories + #categories = None + print("Loading 20 newsgroups dataset for categories:") + print(categories) + + data = fetch_20newsgroups(subset='train', categories=categories) + print("%d documents" % len(data.filenames)) + print("%d categories" % len(data.target_names)) + + X = data.data + y = data.target + + # define a pipeline combining a text feature extractor with a simple + # classifier + pipeline = Pipeline([ + ('vect', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('clf', SGDClassifier()), + ]) + + # uncommenting more parameters will give better exploring power but will + # increase processing time in a combinatorial way + parameters = { + 'vect__max_df': [0.5,1.], + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': [(1, 1), 
(1, 2)], # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': [0.000001, 0.00001], + 'clf__penalty': ['l2', 'elasticnet'], + #'clf__n_iter': (10, 50, 80), + } + + parameters_details = { + 'vect__max_df': 'float', + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': 'cat' , # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': 'float', + 'clf__penalty': 'cat', + #'clf__n_iter': (10, 50, 80), + } + + else: + print('Dataset not available for test') + + # GP search + all_gp_results = [] + print 'GP search' + for i in range(n_tests): + search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + n_iter=n_iter_search, n_init=20, verbose=False) + _,scores = search._fit() + + max_scores = [scores[0]] + print 'Test',i,'-',len(scores),'parameters tested' + + for j in range(1,len(scores)): + max_scores.append(max(max_scores[j-1],scores[j])) + all_gp_results.append(extend_result(n_iter_search,max_scores)) + all_gp_results = np.asarray(all_gp_results) + print all_gp_results.shape + + # Randomized search + print 'Random search' + all_random_results = [] + for i in range(n_tests): + random_search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + n_iter=n_iter_search, n_init=n_iter_search, verbose=False) + _,scores = search._fit() + + max_scores = [scores[0]] + print 'Test',i,'-',len(scores),'parameters tested' + + for j in range(1,len(scores)): + max_scores.append(max(max_scores[j-1],scores[j])) + all_random_results.append(extend_result(n_iter_search,max_scores)) + all_random_results = np.asarray(all_random_results) + + plt.figure() + plt.plot(range(n_iter_search),np.mean(all_gp_results,axis=0),'r',label='GP') + plt.plot(range(n_iter_search),np.mean(all_random_results,axis=0),'g',label='Random') + plt.legend() + plt.title('Test GP vs Random on ' + test_name +' dataset - Average on ' + str(n_tests) + ' trials') + plt.xlabel('Iterations') + plt.ylabel('Max CV performance') + plt.show() + + + +if __name__ == "__main__": + + # print 'Routine Test' + # test2() + test_name = 'iris' + n_tests = 20 + search_lenght = 50 + print '\nTest GP vs Random on',test_name,'dataset - Average on',n_tests,'trials' + gp_vs_random_search(test_name,n_tests,search_lenght) \ No newline at end of file diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 44e1c201e2303..28c70876eaba4 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -211,6 +211,7 @@ def _fit(self): gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters) , thetaL = 0.001 * np.ones(self.n_parameters) , thetaU = 10. 
* np.ones(self.n_parameters) , + random_start = 3, nugget= self.gp_nugget) gp.fit(tested_parameters[:n_tested_parameters,:],cv_scores[:n_tested_parameters]) From 69b56f49d49e34612b8e970781276bc59d9e5ba6 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Fri, 7 Aug 2015 17:53:33 +0200 Subject: [PATCH 5/9] Added EI and changed parameter descriptions --- sklearn/gp_search.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 28c70876eaba4..505f3c21ee31a 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -8,6 +8,7 @@ # License: BSD 3 clause import numpy as np +from scipy.stats import norm from .gaussian_process.gaussian_process import GaussianProcess from .cross_validation import check_cv @@ -34,6 +35,13 @@ def sample_candidates(n_candidates,param_bounds,param_isInt): return compute_unique(candidates) +def compute_ei(predictions,sigma,y_best): + ei_array = np.zeros(predictions.shape[0]) + for i in range(ei_array.shape[0]): + z = (y_best - predictions[i]) / sigma[i] + ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z)) + return ei_array + def compute_unique(a): # keep only unique values in the ndarray a # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array @@ -70,18 +78,14 @@ class GPSearchCV(object): """ Examples -------- - >>> parameters = {'kernel' : ['rbf','poly'], - ... 'd' : [1,3], - ... 'C' : [1,10] } - >>> parameters_details = {'kernel' : 'cat', - ... 'd' : 'int', - ... 'C' : 'float'} + >>> parameters = {'kernel' : ['cat', ['rbf','poly']], + ... 'd' : ['int', [1,3]], + ... 'C' : ['float',[1,10])} """ def __init__(self, parameters, - parameters_details, estimator, scoring=None, X=None,y=None, @@ -97,13 +101,12 @@ def __init__(self, self.parameters = parameters self.n_parameters = len(parameters) - self.parameters_details = parameters_details self.acquisition_function = acquisition_function self.n_iter = n_iter self.n_init = n_init self.n_candidates = n_candidates self.param_names = parameters.keys() - self.param_isInt = np.array([ 0 if (parameters_details[k]=='float') else 1 for k in self.param_names ]) + self.param_isInt = np.array([ 0 if (parameters[k][0]=='float') else 1 for k in self.param_names ]) self.param_bounds = np.zeros((self.n_parameters,2)) self.gp_nugget = gp_nugget self.verbose = verbose @@ -127,17 +130,16 @@ def __init__(self, # init param_bounds for i in range(self.n_parameters): - if(parameters_details[self.param_names[i]]=='cat'): + if(parameters[self.param_names[i]][0]=='cat'): self.param_bounds[i,0] = 0 - self.param_bounds[i,1] = len(parameters[self.param_names[i]]) + self.param_bounds[i,1] = len(parameters[self.param_names[i]][1]) else: - self.param_bounds[i] = np.array(parameters[self.param_names[i]]) - if(parameters_details[self.param_names[i]]=='int'): + self.param_bounds[i] = np.array(parameters[self.param_names[i]][1]) + if(parameters[self.param_names[i]][0]=='int'): self.param_bounds[i,1] += 1 if(self.verbose): print(self.parameters) - print(self.parameters_details) print(self.param_names) print(self.param_isInt) print(self.param_bounds) @@ -146,9 +148,9 @@ def __init__(self, def vector_to_dict(self,vector_parameter): dict_parameter = dict.fromkeys(self.param_names) for i in range(self.n_parameters): - if(self.parameters_details[self.param_names[i]]=='cat'): - dict_parameter[self.param_names[i]] = self.parameters[self.param_names[i]][int(vector_parameter[i])] - 
elif(self.parameters_details[self.param_names[i]]=='int'): + if(self.parameters[self.param_names[i]][0]=='cat'): + dict_parameter[self.param_names[i]] = (self.parameters[self.param_names[i]][1])[int(vector_parameter[i])] + elif(self.parameters[self.param_names[i]][0]=='int'): dict_parameter[self.param_names[i]] = int(vector_parameter[i]) else: dict_parameter[self.param_names[i]] = vector_parameter[i] @@ -222,6 +224,12 @@ def _fit(self): upperBound = predictions + 1.96*np.sqrt(MSE) best_candidate = candidates[np.argmax(upperBound)] + elif(self.acquisition_function == 'EI'): + predictions,MSE = gp.predict(candidates,eval_MSE=True) + y_best = np.max(cv_scores) + ei = compute_ei(predictions,np.sqrt(MSE),y_best) + best_candidate = candidates[np.argmax(ei)] + else: print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) From bb9fb488c29e25fabe4032a486a5653c25c1cb9a Mon Sep 17 00:00:00 2001 From: Sebastien Date: Fri, 7 Aug 2015 17:55:05 +0200 Subject: [PATCH 6/9] example/update and fixed typo --- examples/model_selection/gp_search.py | 155 ++++++++++++-------------- 1 file changed, 72 insertions(+), 83 deletions(-) diff --git a/examples/model_selection/gp_search.py b/examples/model_selection/gp_search.py index 72ff41c280df7..4b47e645536bb 100644 --- a/examples/model_selection/gp_search.py +++ b/examples/model_selection/gp_search.py @@ -27,31 +27,23 @@ def test1(): clf = RandomForestClassifier(n_estimators=20) # specify parameters and distributions to sample from - parameters = {"max_depth": [3, 3], - "max_features": [1,11], - "min_samples_split": [1,11], - "min_samples_leaf": [1,11], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]} - - parameters_details = {"max_depth": 'int', - "max_features": 'int', - "min_samples_split": 'int', - "min_samples_leaf": 'int', - "bootstrap": 'cat', - "criterion": 'cat'} - - search = GPSearchCV(parameters,parameters_details,estimator=clf,X=X,y=y,n_iter=20) + parameters = {"max_depth": ['int',[3, 3]], + "max_features": ['int',[1,11]], + "min_samples_split": ['int',[1,11]], + "min_samples_leaf": ['int',[1,11]], + "bootstrap": ['cat',[True, False]], + "criterion": ['cat',["gini", "entropy"]]} + + search = GPSearchCV(parameters,estimator=clf,X=X,y=y,n_iter=20) search._fit() def test2(): - parameters = {'kernel' : ['rbf','poly'],'d' : [1,3],'C' : [1,10] } - parameters_details = {'kernel' : 'cat','d' : 'int','C' : 'float'} + parameters = {'kernel' : ['cat',['rbf','poly']],'d' : ['int',[1,3]],'C' : ['float',[1,10]] } def scoring_function(x): return 0.5 - search = GPSearchCV(parameters,parameters_details,estimator=scoring_function,n_iter=20) + search = GPSearchCV(parameters,estimator=scoring_function,n_iter=20) search._fit() @@ -87,32 +79,21 @@ def test3(): # uncommenting more parameters will give better exploring power but will # increase processing time in a combinatorial way parameters = { - 'vect__max_df': [0.5,1.], + 'vect__max_df': ['float',[0.5,1.]], #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': [(1, 1), (1, 2)], # unigrams or bigrams + 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams #'tfidf__use_idf': (True, False), #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': [0.000001, 0.00001], - 'clf__penalty': ['l2', 'elasticnet'], + 'clf__alpha': ['float',[0.000001, 0.00001]], + 'clf__penalty': ['cat',['l2', 'elasticnet']] #'clf__n_iter': (10, 50, 80), } - parameters_details = { - 'vect__max_df': 'float', - #'vect__max_features': (None, 5000, 10000, 50000), - 
'vect__ngram_range': 'cat' , # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': 'float', - 'clf__penalty': 'cat', - #'clf__n_iter': (10, 50, 80), - } - - search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=data.data, y=data.target,n_iter=20) + search = GPSearchCV(parameters,estimator=pipeline,X=data.data, y=data.target,n_iter=20) search._fit() -def gp_vs_random_search(test_name,n_tests,search_lenght): +def gp_vs_random_search(test_name,n_tests,search_lenght,save_data=False): """ Compare GP-based search vs a simple random one Choose test_name in {'iris','text'} @@ -123,22 +104,15 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): if(test_name == 'iris'): iris = load_digits() X, y = iris.data, iris.target - pipeline = RandomForestClassifier(n_estimators=20) + pipeline = RandomForestClassifier() # specify parameters and distributions to sample from - parameters = {"max_depth": [2, 15], - "max_features": [1,20], - "min_samples_split": [1,20], - "min_samples_leaf": [1,20], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]} - - parameters_details = {"max_depth": 'int', - "max_features": 'int', - "min_samples_split": 'int', - "min_samples_leaf": 'int', - "bootstrap": 'cat', - "criterion": 'cat'} + parameters = {"max_depth": ['int',[3, 3]], + "max_features": ['int',[1,11]], + "min_samples_split": ['int',[1,11]], + "min_samples_leaf": ['int',[1,11]], + "bootstrap": ['cat',[True, False]], + "criterion": ['cat',["gini", "entropy"]]} elif(test_name == 'text'): # Display progress logs on stdout @@ -173,54 +147,66 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): # uncommenting more parameters will give better exploring power but will # increase processing time in a combinatorial way parameters = { - 'vect__max_df': [0.5,1.], - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': [(1, 1), (1, 2)], # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': [0.000001, 0.00001], - 'clf__penalty': ['l2', 'elasticnet'], - #'clf__n_iter': (10, 50, 80), - } - - parameters_details = { - 'vect__max_df': 'float', - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': 'cat' , # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': 'float', - 'clf__penalty': 'cat', - #'clf__n_iter': (10, 50, 80), + 'vect__max_df': ['float',[0.5,1.]], + #'vect__max_features': (None, 5000, 10000, 50000), + 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams + #'tfidf__use_idf': (True, False), + #'tfidf__norm': ('l1', 'l2'), + 'clf__alpha': ['float',[0.000001, 0.00001]], + 'clf__penalty': ['cat',['l2', 'elasticnet']] + #'clf__n_iter': (10, 50, 80), } else: print('Dataset not available for test') - # GP search - all_gp_results = [] - print 'GP search' + # GP UCB search + all_gp_ucb_results = [] + print 'GP_ucb search' for i in range(n_tests): - search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + ucb_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, + acquisition_function='UCB', n_iter=n_iter_search, n_init=20, verbose=False) - _,scores = search._fit() + _,scores = ucb_search._fit() max_scores = [scores[0]] print 'Test',i,'-',len(scores),'parameters tested' for j in range(1,len(scores)): max_scores.append(max(max_scores[j-1],scores[j])) - all_gp_results.append(extend_result(n_iter_search,max_scores)) - all_gp_results = 
np.asarray(all_gp_results) - print all_gp_results.shape + all_gp_ucb_results.append(extend_result(n_iter_search,max_scores)) + all_gp_ucb_results = np.asarray(all_gp_ucb_results) + print all_gp_ucb_results.shape + if(save_data): + np.savetxt('gp_ucb_scores.csv',all_gp_ucb_results,delimiter=',') + + # # GP EI search + # all_gp_ei_results = [] + # print 'GP_ei search' + # for i in range(n_tests): + # ei_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, + # acquisition_function='EI', + # n_iter=n_iter_search, n_init=20, verbose=False) + # _,scores = ei_search._fit() + + # max_scores = [scores[0]] + # print 'Test',i,'-',len(scores),'parameters tested' + + # for j in range(1,len(scores)): + # max_scores.append(max(max_scores[j-1],scores[j])) + # all_gp_ei_results.append(extend_result(n_iter_search,max_scores)) + # all_gp_ei_results = np.asarray(all_gp_ei_results) + # print all_gp_ei_results.shape + # if(save_data): + # np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',') # Randomized search print 'Random search' all_random_results = [] for i in range(n_tests): - random_search = GPSearchCV(parameters,parameters_details,estimator=pipeline,X=X,y=y, + random_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, n_iter=n_iter_search, n_init=n_iter_search, verbose=False) - _,scores = search._fit() + _,scores = random_search._fit() max_scores = [scores[0]] print 'Test',i,'-',len(scores),'parameters tested' @@ -229,11 +215,14 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): max_scores.append(max(max_scores[j-1],scores[j])) all_random_results.append(extend_result(n_iter_search,max_scores)) all_random_results = np.asarray(all_random_results) + if(save_data): + np.savetxt('rand_scores.csv',all_random_results,delimiter=',') plt.figure() - plt.plot(range(n_iter_search),np.mean(all_gp_results,axis=0),'r',label='GP') + # plt.plot(range(n_iter_search),np.mean(all_gp_ei_results,axis=0),'r',label='GP-EI') + plt.plot(range(n_iter_search),np.mean(all_gp_ucb_results,axis=0),'b',label='GP-UCB') plt.plot(range(n_iter_search),np.mean(all_random_results,axis=0),'g',label='Random') - plt.legend() + plt.legend(loc=4) plt.title('Test GP vs Random on ' + test_name +' dataset - Average on ' + str(n_tests) + ' trials') plt.xlabel('Iterations') plt.ylabel('Max CV performance') @@ -245,8 +234,8 @@ def gp_vs_random_search(test_name,n_tests,search_lenght): # print 'Routine Test' # test2() - test_name = 'iris' + test_name = 'text' n_tests = 20 - search_lenght = 50 + search_lenght = 60 print '\nTest GP vs Random on',test_name,'dataset - Average on',n_tests,'trials' - gp_vs_random_search(test_name,n_tests,search_lenght) \ No newline at end of file + gp_vs_random_search(test_name,n_tests,search_lenght,save_data=True) \ No newline at end of file From 7af5f6a3ef1f9201fdf129fac103bbacc36de3da Mon Sep 17 00:00:00 2001 From: Sebastien Date: Sat, 29 Aug 2015 19:40:58 +0200 Subject: [PATCH 7/9] FIX for pep8 conformity --- sklearn/gp_search.py | 471 +++++++++++++++++++++++-------------------- 1 file changed, 250 insertions(+), 221 deletions(-) diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py index 505f3c21ee31a..d0529d779a07f 100644 --- a/sklearn/gp_search.py +++ b/sklearn/gp_search.py @@ -16,246 +16,275 @@ from .metrics.scorer import check_scoring from .base import is_classifier, clone -##################### UTILS ##################### -def sample_candidates(n_candidates,param_bounds,param_isInt): - n_parameters = param_isInt.shape[0] - candidates = [] +# UTILS # - for k in range(n_parameters): - 
if(param_isInt[k]): - k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] , - dtype = np.int32) - else: - k_sample = np.asarray( np.random.rand(n_candidates) * np.float(param_bounds[k][1]-param_bounds[k][0]) + param_bounds[k][0] ) - candidates.append(k_sample) - candidates = np.asarray(candidates) - candidates = candidates.T +def sample_candidates(n_candidates, param_bounds, param_isInt): - return compute_unique(candidates) + n_parameters = param_isInt.shape[0] + candidates = [] + + for k in range(n_parameters): + if(param_isInt[k]): + k_sample = np.asarray( + np.random.rand(n_candidates) + * np.float(param_bounds[k][1]-param_bounds[k][0]) + + param_bounds[k][0], + dtype=np.int32) + else: + k_sample = np.asarray( + np.random.rand(n_candidates) + * np.float(param_bounds[k][1]-param_bounds[k][0]) + + param_bounds[k][0]) + candidates.append(k_sample) + + candidates = np.asarray(candidates) + candidates = candidates.T + + return compute_unique(candidates) + + +def compute_ei(predictions, sigma, y_best): + ei_array = np.zeros(predictions.shape[0]) + for i in range(ei_array.shape[0]): + z = (y_best - predictions[i]) / sigma[i] + ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z)) + return ei_array -def compute_ei(predictions,sigma,y_best): - ei_array = np.zeros(predictions.shape[0]) - for i in range(ei_array.shape[0]): - z = (y_best - predictions[i]) / sigma[i] - ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z)) - return ei_array def compute_unique(a): - # keep only unique values in the ndarray a - # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array + # keep only unique values in the ndarray a + # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array - b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1]))) - _, idx = np.unique(b, return_index=True) - idx =np.sort(idx) + b = np.ascontiguousarray(a).view( + np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + ) + _, idx = np.unique(b, return_index=True) + idx = np.sort(idx) - return a[idx] + return a[idx] -def is_in_ndarray(item,a): - # look for element item in ndarray a - # returns True if item is in a, and its index - - k = 0 - idx_val = np.asarray(range(a.shape[0])) - idxk = range(a.shape[0]) - while( k < a.shape[1]): - idxk = (a[idxk,k]==item[k]) - if(np.sum(idxk > 0)): - k += 1 - idx_val = idx_val[idxk] - idxk = list(idx_val) - else: - return False,0 - return True,idx_val[0] +def is_in_ndarray(item, a): + # look for element item in ndarray a + # returns True if item is in a, and its index + k = 0 + idx_val = np.asarray(range(a.shape[0])) + idxk = range(a.shape[0]) + while(k < a.shape[1]): + idxk = (a[idxk, k] == item[k]) + if(np.sum(idxk > 0)): + k += 1 + idx_val = idx_val[idxk] + idxk = list(idx_val) + else: + return False, 0 + return True, idx_val[0] -##################### GPSearchCV ##################### +# GPSearchCV # class GPSearchCV(object): - """ + """ Examples -------- >>> parameters = {'kernel' : ['cat', ['rbf','poly']], - ... 'd' : ['int', [1,3]], - ... 'C' : ['float',[1,10])} + ... 'd' : ['int', [1,3]], + ... 
'C' : ['float',[1,10])} """ - def __init__(self, - parameters, - estimator, - scoring=None, - X=None,y=None, - fit_params=None, - refit=True, - cv=None, - acquisition_function = 'UCB', - n_iter=100, - n_init=10, - n_candidates = 500, - gp_nugget=1.e-10, - verbose=True): - - self.parameters = parameters - self.n_parameters = len(parameters) - self.acquisition_function = acquisition_function - self.n_iter = n_iter - self.n_init = n_init - self.n_candidates = n_candidates - self.param_names = parameters.keys() - self.param_isInt = np.array([ 0 if (parameters[k][0]=='float') else 1 for k in self.param_names ]) - self.param_bounds = np.zeros((self.n_parameters,2)) - self.gp_nugget = gp_nugget - self.verbose = verbose - self.scoring = scoring - self.estimator = estimator - self.fit_params = fit_params if fit_params is not None else {} - self.cv = cv - self.X = X - self.y = y - - if(callable(estimator)): - self._callable_estimator = True - if(verbose): - print('Estimator is a callable and not an sklearn Estimator') - else: - self._callable_estimator = False - - - if not self._callable_estimator: - self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) - - # init param_bounds - for i in range(self.n_parameters): - if(parameters[self.param_names[i]][0]=='cat'): - self.param_bounds[i,0] = 0 - self.param_bounds[i,1] = len(parameters[self.param_names[i]][1]) - else: - self.param_bounds[i] = np.array(parameters[self.param_names[i]][1]) - if(parameters[self.param_names[i]][0]=='int'): - self.param_bounds[i,1] += 1 - - if(self.verbose): - print(self.parameters) - print(self.param_names) - print(self.param_isInt) - print(self.param_bounds) - - - def vector_to_dict(self,vector_parameter): - dict_parameter = dict.fromkeys(self.param_names) - for i in range(self.n_parameters): - if(self.parameters[self.param_names[i]][0]=='cat'): - dict_parameter[self.param_names[i]] = (self.parameters[self.param_names[i]][1])[int(vector_parameter[i])] - elif(self.parameters[self.param_names[i]][0]=='int'): - dict_parameter[self.param_names[i]] = int(vector_parameter[i]) - else: - dict_parameter[self.param_names[i]] = vector_parameter[i] - - return dict_parameter - - def score(self,test_parameter): - if not self._callable_estimator: - cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator)) - cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_, - train, test, False, test_parameter, - self.fit_params, return_parameters=True) - for train, test in cv ] - - n_test_samples = 0 - score = 0 - for tmp_score, tmp_n_test_samples, _, _ in cv_score: - tmp_score *= tmp_n_test_samples - n_test_samples += tmp_n_test_samples - score += tmp_score - score /= float(n_test_samples) - - else: - score = self.estimator(test_parameter) - - return score - - - def _fit(self): - - n_tested_parameters = 0 - tested_parameters = np.zeros((self.n_iter,self.n_parameters)) - cv_scores = np.zeros(self.n_iter) - - ### Initialize with random candidates ### - init_candidates = sample_candidates(self.n_init,self.param_bounds,self.param_isInt) - self.n_init = init_candidates.shape[0] - - for i in range(self.n_init): - dict_candidate = self.vector_to_dict(init_candidates[i,:]) - cv_score = self.score(dict_candidate) - - if(self.verbose): - print ('Step ' + str(i) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) - - is_in,idx = is_in_ndarray(init_candidates[i,:],tested_parameters[:n_tested_parameters,:]) - if not is_in: - tested_parameters[n_tested_parameters,:] = 
init_candidates[i,:] - cv_scores[n_tested_parameters] = cv_score - n_tested_parameters += 1 - else: - if(self.verbose): - print('Hyperparameter already tesed') - cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. - - - for i in range(self.n_iter-self.n_init): - - # Model with a Gaussian Process - gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters) , - thetaL = 0.001 * np.ones(self.n_parameters) , - thetaU = 10. * np.ones(self.n_parameters) , - random_start = 3, - nugget= self.gp_nugget) - gp.fit(tested_parameters[:n_tested_parameters,:],cv_scores[:n_tested_parameters]) - - # Sample candidates and predict their corresponding acquisition values - candidates = sample_candidates(self.n_candidates,self.param_bounds,self.param_isInt) - if(self.acquisition_function == 'UCB'): - predictions,MSE = gp.predict(candidates,eval_MSE=True) - upperBound = predictions + 1.96*np.sqrt(MSE) - best_candidate = candidates[np.argmax(upperBound)] - - elif(self.acquisition_function == 'EI'): - predictions,MSE = gp.predict(candidates,eval_MSE=True) - y_best = np.max(cv_scores) - ei = compute_ei(predictions,np.sqrt(MSE),y_best) - best_candidate = candidates[np.argmax(ei)] - - else: - print('WARNING : acquisition_function not implemented yet : ' + self.acquisition_function) - - dict_candidate = self.vector_to_dict(best_candidate) - cv_score = self.score(dict_candidate) - if(self.verbose): - print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + str(dict_candidate) + ' ' + str(cv_score)) - - is_in,idx = is_in_ndarray(best_candidate,tested_parameters[:n_tested_parameters,:]) - if not is_in: - tested_parameters[n_tested_parameters,:] = best_candidate - cv_scores[n_tested_parameters] = cv_score - n_tested_parameters += 1 - else: - if(self.verbose): - print('Hyperparameter already tesed') - cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. 
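
Aside, not part of the patch itself: compute_ei above evaluates the closed-form
expected improvement EI = sigma * (z * cdf(z) + pdf(z)) with
z = (y_best - mu) / sigma one candidate at a time; the same formula vectorizes
directly over NumPy arrays, e.g.:

    import numpy as np
    from scipy.stats import norm

    def compute_ei_vectorized(mu, sigma, y_best):
        # same closed form as compute_ei, applied to whole arrays at once
        z = (y_best - mu) / sigma
        return sigma * (z * norm.cdf(z) + norm.pdf(z))

    # e.g. compute_ei_vectorized(np.array([0.8, 0.9]), np.array([0.1, 0.2]), 0.95)
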
- - best_idx = np.argmax(cv_scores[:n_tested_parameters]) - vector_best_param = tested_parameters[best_idx] - best_parameter = self.vector_to_dict(vector_best_param) - - if(self.verbose): - print ('\nTested ' + str(n_tested_parameters) + ' parameters') - print ('Max cv score ' + str(cv_scores[best_idx])) - print ('Best parameter ' + str(tested_parameters[best_idx])) - print(best_parameter) - - return tested_parameters[:n_tested_parameters,:], cv_scores[:n_tested_parameters] \ No newline at end of file + def __init__(self, + parameters, + estimator, + scoring=None, + X=None, y=None, + fit_params=None, + refit=True, + cv=None, + acquisition_function='UCB', + n_iter=100, + n_init=10, + n_candidates=500, + gp_nugget=1.e-10, + verbose=True): + + self.parameters = parameters + self.n_parameters = len(parameters) + self.acquisition_function = acquisition_function + self.n_iter = n_iter + self.n_init = n_init + self.n_candidates = n_candidates + self.param_names = parameters.keys() + self.param_isInt = np.array([0 if (parameters[k][0] == 'float') + else 1 for k in self.param_names]) + self.param_bounds = np.zeros((self.n_parameters, 2)) + self.gp_nugget = gp_nugget + self.verbose = verbose + self.scoring = scoring + self.estimator = estimator + self.fit_params = fit_params if fit_params is not None else {} + self.cv = cv + self.X = X + self.y = y + + if(callable(estimator)): + self._callable_estimator = True + if(verbose): + print('Estimator is a callable and not an sklearn Estimator') + else: + self._callable_estimator = False + + if not self._callable_estimator: + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) + + # init param_bounds + for i in range(self.n_parameters): + if(parameters[self.param_names[i]][0] == 'cat'): + self.param_bounds[i, 0] = 0 + self.param_bounds[i, 1] = \ + len(parameters[self.param_names[i]][1]) + else: + self.param_bounds[i] = \ + np.array(parameters[self.param_names[i]][1]) + if(parameters[self.param_names[i]][0] == 'int'): + self.param_bounds[i, 1] += 1 + + if(self.verbose): + print(self.parameters) + print(self.param_names) + print(self.param_isInt) + print(self.param_bounds) + + def vector_to_dict(self, vector_parameter): + dict_parameter = dict.fromkeys(self.param_names) + for i in range(self.n_parameters): + if(self.parameters[self.param_names[i]][0] == 'cat'): + dict_parameter[self.param_names[i]] = \ + (self.parameters[self.param_names[i]][1])[ + int(vector_parameter[i])] + elif(self.parameters[self.param_names[i]][0] == 'int'): + dict_parameter[self.param_names[i]] = int(vector_parameter[i]) + else: + dict_parameter[self.param_names[i]] = vector_parameter[i] + + return dict_parameter + + def score(self, test_parameter): + if not self._callable_estimator: + cv = check_cv(self.cv, self.X, self.y, + classifier=is_classifier(self.estimator)) + cv_score = [_fit_and_score( + clone(self.estimator), self.X, self.y, self.scorer_, + train, test, False, test_parameter, + self.fit_params, return_parameters=True) + for train, test in cv] + + n_test_samples = 0 + score = 0 + for tmp_score, tmp_n_test_samples, _, _ in cv_score: + tmp_score *= tmp_n_test_samples + n_test_samples += tmp_n_test_samples + score += tmp_score + score /= float(n_test_samples) + + else: + score = self.estimator(test_parameter) + + return score + + def _fit(self): + + n_tested_parameters = 0 + tested_parameters = np.zeros((self.n_iter, self.n_parameters)) + cv_scores = np.zeros(self.n_iter) + + # Initialize with random candidates # + init_candidates = sample_candidates( + 
self.n_init, self.param_bounds, self.param_isInt) + self.n_init = init_candidates.shape[0] + + for i in range(self.n_init): + dict_candidate = self.vector_to_dict(init_candidates[i, :]) + cv_score = self.score(dict_candidate) + + if(self.verbose): + print ('Step ' + str(i) + ' - Hyperparameter ' + + str(dict_candidate) + ' ' + str(cv_score)) + + is_in, idx = is_in_ndarray( + init_candidates[i, :], + tested_parameters[:n_tested_parameters, :]) + if not is_in: + tested_parameters[n_tested_parameters, :] = \ + init_candidates[i, :] + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(self.verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. + + for i in range(self.n_iter-self.n_init): + + # Model with a Gaussian Process + gp = GaussianProcess(theta0=1. * np.ones(self.n_parameters), + thetaL=0.001 * np.ones(self.n_parameters), + thetaU=10. * np.ones(self.n_parameters), + random_start=3, + nugget=self.gp_nugget) + gp.fit(tested_parameters[:n_tested_parameters, :], + cv_scores[:n_tested_parameters]) + + # Sample candidates and predict their corresponding + # acquisition values + candidates = sample_candidates(self.n_candidates, + self.param_bounds, + self.param_isInt) + if(self.acquisition_function == 'UCB'): + predictions, MSE = gp.predict(candidates, eval_MSE=True) + upperBound = predictions + 1.96*np.sqrt(MSE) + best_candidate = candidates[np.argmax(upperBound)] + + elif(self.acquisition_function == 'EI'): + predictions, MSE = gp.predict(candidates, eval_MSE=True) + y_best = np.max(cv_scores) + ei = compute_ei(predictions, np.sqrt(MSE), y_best) + best_candidate = candidates[np.argmax(ei)] + + else: + print('WARNING : acquisition_function not implemented yet : ' + + self.acquisition_function) + + dict_candidate = self.vector_to_dict(best_candidate) + cv_score = self.score(dict_candidate) + if(self.verbose): + print ('Step ' + str(i+self.n_init) + ' - Hyperparameter ' + + str(dict_candidate) + ' ' + str(cv_score)) + + is_in, idx = is_in_ndarray( + best_candidate, + tested_parameters[:n_tested_parameters, :]) + if not is_in: + tested_parameters[n_tested_parameters, :] = best_candidate + cv_scores[n_tested_parameters] = cv_score + n_tested_parameters += 1 + else: + if(self.verbose): + print('Hyperparameter already tesed') + cv_scores[idx] = (cv_scores[idx] + cv_score) / 2. 
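+                # The update above is not an unweighted running mean: the
+                # newest evaluation keeps weight 1/2 and older ones decay
+                # geometrically, e.g. scores 0.6, 0.8, 0.7 observed in that
+                # order are stored as 0.25*0.6 + 0.25*0.8 + 0.5*0.7 = 0.7.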
+ + best_idx = np.argmax(cv_scores[:n_tested_parameters]) + vector_best_param = tested_parameters[best_idx] + best_parameter = self.vector_to_dict(vector_best_param) + + if(self.verbose): + print ('\nTested ' + str(n_tested_parameters) + ' parameters') + print ('Max cv score ' + str(cv_scores[best_idx])) + print ('Best parameter ' + str(tested_parameters[best_idx])) + print (best_parameter) + + return tested_parameters[:n_tested_parameters, :], \ + cv_scores[:n_tested_parameters] From c6b193cadcf930319a65df1de8b4f88bb0385e25 Mon Sep 17 00:00:00 2001 From: Sebastien Date: Sat, 29 Aug 2015 22:27:37 +0200 Subject: [PATCH 8/9] FIX pep8 conformity --- examples/model_selection/gp_search.py | 427 +++++++++++++------------- 1 file changed, 216 insertions(+), 211 deletions(-) diff --git a/examples/model_selection/gp_search.py b/examples/model_selection/gp_search.py index 4b47e645536bb..5005d3a1b023d 100644 --- a/examples/model_selection/gp_search.py +++ b/examples/model_selection/gp_search.py @@ -12,230 +12,235 @@ import matplotlib.pyplot as plt import numpy as np -def extend_result(n_tests,tmp_res): - res = np.zeros(n_tests) - l = len(tmp_res) -1 - for i in range(n_tests): - res[i] = tmp_res[min(i,l)] - return res +def extend_result(n_tests, tmp_res): + res = np.zeros(n_tests) + l = len(tmp_res) - 1 + for i in range(n_tests): + res[i] = tmp_res[min(i, l)] + + return res def test1(): - iris = load_digits() - X, y = iris.data, iris.target - clf = RandomForestClassifier(n_estimators=20) + iris = load_digits() + X, y = iris.data, iris.target + clf = RandomForestClassifier(n_estimators=20) - # specify parameters and distributions to sample from - parameters = {"max_depth": ['int',[3, 3]], - "max_features": ['int',[1,11]], - "min_samples_split": ['int',[1,11]], - "min_samples_leaf": ['int',[1,11]], - "bootstrap": ['cat',[True, False]], - "criterion": ['cat',["gini", "entropy"]]} + # specify parameters and distributions to sample from + parameters = {"max_depth": ['int', [3, 3]], + "max_features": ['int', [1, 11]], + "min_samples_split": ['int', [1, 11]], + "min_samples_leaf": ['int', [1, 11]], + "bootstrap": ['cat', [True, False]], + "criterion": ['cat', ["gini", "entropy"]]} - search = GPSearchCV(parameters,estimator=clf,X=X,y=y,n_iter=20) - search._fit() + search = GPSearchCV(parameters, estimator=clf, X=X, y=y, n_iter=20) + search._fit() def test2(): - parameters = {'kernel' : ['cat',['rbf','poly']],'d' : ['int',[1,3]],'C' : ['float',[1,10]] } - def scoring_function(x): - return 0.5 + parameters = {'kernel': ['cat', ['rbf', 'poly']], + 'd': ['int', [1, 3]], + 'C': ['float', [1, 10]]} - search = GPSearchCV(parameters,estimator=scoring_function,n_iter=20) - search._fit() + def scoring_function(x): + return 0.5 + search = GPSearchCV(parameters, estimator=scoring_function, n_iter=20) + search._fit() -def test3(): - # Display progress logs on stdout - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - # Load some categories from the training set - categories = [ - 'alt.atheism', - 'talk.religion.misc', - ] - # Uncomment the following to do the analysis on all the categories - #categories = None - - print("Loading 20 newsgroups dataset for categories:") - print(categories) - - data = fetch_20newsgroups(subset='train', categories=categories) - print("%d documents" % len(data.filenames)) - print("%d categories" % len(data.target_names)) - print() - - # define a pipeline combining a text feature extractor with a simple - # classifier - pipeline = Pipeline([ - 
('vect', CountVectorizer()), - ('tfidf', TfidfTransformer()), - ('clf', SGDClassifier()), - ]) - - # uncommenting more parameters will give better exploring power but will - # increase processing time in a combinatorial way - parameters = { - 'vect__max_df': ['float',[0.5,1.]], - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': ['float',[0.000001, 0.00001]], - 'clf__penalty': ['cat',['l2', 'elasticnet']] - #'clf__n_iter': (10, 50, 80), - } - - search = GPSearchCV(parameters,estimator=pipeline,X=data.data, y=data.target,n_iter=20) - search._fit() - - -def gp_vs_random_search(test_name,n_tests,search_lenght,save_data=False): - """ - Compare GP-based search vs a simple random one - Choose test_name in {'iris','text'} - """ - - n_iter_search = search_lenght - - if(test_name == 'iris'): - iris = load_digits() - X, y = iris.data, iris.target - pipeline = RandomForestClassifier() - - # specify parameters and distributions to sample from - parameters = {"max_depth": ['int',[3, 3]], - "max_features": ['int',[1,11]], - "min_samples_split": ['int',[1,11]], - "min_samples_leaf": ['int',[1,11]], - "bootstrap": ['cat',[True, False]], - "criterion": ['cat',["gini", "entropy"]]} - - elif(test_name == 'text'): - # Display progress logs on stdout - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - # Load some categories from the training set - categories = [ - 'alt.atheism', - 'talk.religion.misc', - ] - # Uncomment the following to do the analysis on all the categories - #categories = None - print("Loading 20 newsgroups dataset for categories:") - print(categories) - - data = fetch_20newsgroups(subset='train', categories=categories) - print("%d documents" % len(data.filenames)) - print("%d categories" % len(data.target_names)) - - X = data.data - y = data.target - - # define a pipeline combining a text feature extractor with a simple - # classifier - pipeline = Pipeline([ - ('vect', CountVectorizer()), - ('tfidf', TfidfTransformer()), - ('clf', SGDClassifier()), - ]) - - # uncommenting more parameters will give better exploring power but will - # increase processing time in a combinatorial way - parameters = { - 'vect__max_df': ['float',[0.5,1.]], - #'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': ['cat',[(1, 1), (1, 2)]], # unigrams or bigrams - #'tfidf__use_idf': (True, False), - #'tfidf__norm': ('l1', 'l2'), - 'clf__alpha': ['float',[0.000001, 0.00001]], - 'clf__penalty': ['cat',['l2', 'elasticnet']] - #'clf__n_iter': (10, 50, 80), - } - - else: - print('Dataset not available for test') - - # GP UCB search - all_gp_ucb_results = [] - print 'GP_ucb search' - for i in range(n_tests): - ucb_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y, - acquisition_function='UCB', - n_iter=n_iter_search, n_init=20, verbose=False) - _,scores = ucb_search._fit() - - max_scores = [scores[0]] - print 'Test',i,'-',len(scores),'parameters tested' - - for j in range(1,len(scores)): - max_scores.append(max(max_scores[j-1],scores[j])) - all_gp_ucb_results.append(extend_result(n_iter_search,max_scores)) - all_gp_ucb_results = np.asarray(all_gp_ucb_results) - print all_gp_ucb_results.shape - if(save_data): - np.savetxt('gp_ucb_scores.csv',all_gp_ucb_results,delimiter=',') - - # # GP EI search - # all_gp_ei_results = [] - # print 'GP_ei search' - # for i in range(n_tests): - # ei_search = 
GPSearchCV(parameters,estimator=pipeline,X=X,y=y,
-    #                   acquisition_function='EI',
-    #                   n_iter=n_iter_search, n_init=20, verbose=False)
-    #   _,scores = ei_search._fit()
-
-    #   max_scores = [scores[0]]
-    #   print 'Test',i,'-',len(scores),'parameters tested'
-
-    #   for j in range(1,len(scores)):
-    #       max_scores.append(max(max_scores[j-1],scores[j]))
-    #   all_gp_ei_results.append(extend_result(n_iter_search,max_scores))
-    # all_gp_ei_results = np.asarray(all_gp_ei_results)
-    # print all_gp_ei_results.shape
-    # if(save_data):
-    #   np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',')
-
-    # Randomized search
-    print 'Random search'
-    all_random_results = []
-    for i in range(n_tests):
-        random_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y,
-                    n_iter=n_iter_search, n_init=n_iter_search, verbose=False)
-        _,scores = random_search._fit()
-
-        max_scores = [scores[0]]
-        print 'Test',i,'-',len(scores),'parameters tested'
-
-        for j in range(1,len(scores)):
-            max_scores.append(max(max_scores[j-1],scores[j]))
-        all_random_results.append(extend_result(n_iter_search,max_scores))
-    all_random_results = np.asarray(all_random_results)
-    if(save_data):
-        np.savetxt('rand_scores.csv',all_random_results,delimiter=',')
-
-    plt.figure()
-    # plt.plot(range(n_iter_search),np.mean(all_gp_ei_results,axis=0),'r',label='GP-EI')
-    plt.plot(range(n_iter_search),np.mean(all_gp_ucb_results,axis=0),'b',label='GP-UCB')
-    plt.plot(range(n_iter_search),np.mean(all_random_results,axis=0),'g',label='Random')
-    plt.legend(loc=4)
-    plt.title('Test GP vs Random on ' + test_name +' dataset - Average on ' + str(n_tests) + ' trials')
-    plt.xlabel('Iterations')
-    plt.ylabel('Max CV performance')
-    plt.show()
+def test3():
+    # Display progress logs on stdout
+    logging.basicConfig(level=logging.INFO,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    # Load some categories from the training set
+    categories = [
+        'alt.atheism',
+        'talk.religion.misc',
+    ]
+    # Uncomment the following to do the analysis on all the categories
+    # categories = None
+
+    print("Loading 20 newsgroups dataset for categories:")
+    print(categories)
+
+    data = fetch_20newsgroups(subset='train', categories=categories)
+    print("%d documents" % len(data.filenames))
+    print("%d categories" % len(data.target_names))
+    print()
+
+    # define a pipeline combining a text feature extractor with a simple
+    # classifier
+    pipeline = Pipeline([
+        ('vect', CountVectorizer()),
+        ('tfidf', TfidfTransformer()),
+        ('clf', SGDClassifier()),
+    ])
+
+    # uncommenting more parameters will give better exploring power but will
+    # increase processing time in a combinatorial way
+    parameters = {
+        'vect__max_df': ['float', [0.5, 1.]],
+        # 'vect__max_features': (None, 5000, 10000, 50000),
+        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
+        # 'tfidf__use_idf': (True, False),
+        # 'tfidf__norm': ('l1', 'l2'),
+        'clf__alpha': ['float', [0.000001, 0.00001]],
+        'clf__penalty': ['cat', ['l2', 'elasticnet']]
+        # 'clf__n_iter': (10, 50, 80),
+    }
+
+    search = GPSearchCV(parameters, estimator=pipeline,
+                        X=data.data, y=data.target, n_iter=20)
+    search._fit()
+
+
+def gp_vs_random_search(test_name, n_tests, search_length, save_data=False):
+    """
+    Compare GP-based search vs a simple random one.
+    Choose test_name in {'iris', 'text'}
+    """
+
+    n_iter_search = search_length
+
+    if(test_name == 'iris'):
+        iris = load_digits()
+        X, y = iris.data, iris.target
+        pipeline = RandomForestClassifier()
+
+        # specify parameters and distributions to sample from
+        parameters = 
{"max_depth": ['int', [3, 3]],
+                      "max_features": ['int', [1, 11]],
+                      "min_samples_split": ['int', [1, 11]],
+                      "min_samples_leaf": ['int', [1, 11]],
+                      "bootstrap": ['cat', [True, False]],
+                      "criterion": ['cat', ["gini", "entropy"]]}
+
+    elif(test_name == 'text'):
+        # Display progress logs on stdout
+        logging.basicConfig(level=logging.INFO,
+                            format='%(asctime)s %(levelname)s %(message)s')
+
+        # Load some categories from the training set
+        categories = [
+            'alt.atheism',
+            'talk.religion.misc',
+        ]
+        # Uncomment the following to do the analysis on all the categories
+        # categories = None
+        print("Loading 20 newsgroups dataset for categories:")
+        print(categories)
+
+        data = fetch_20newsgroups(subset='train', categories=categories)
+        print("%d documents" % len(data.filenames))
+        print("%d categories" % len(data.target_names))
+
+        X = data.data
+        y = data.target
+
+        # define a pipeline combining a text feature extractor with a simple
+        # classifier
+        pipeline = Pipeline([
+            ('vect', CountVectorizer()),
+            ('tfidf', TfidfTransformer()),
+            ('clf', SGDClassifier()),
+        ])
+
+        # uncommenting more parameters will give better exploring power but
+        # will increase processing time in a combinatorial way
+        parameters = {
+            'vect__max_df': ['float', [0.5, 1.]],
+            # 'vect__max_features': (None, 5000, 10000, 50000),
+            # unigrams or bigrams
+            'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],
+            # 'tfidf__use_idf': (True, False),
+            # 'tfidf__norm': ('l1', 'l2'),
+            'clf__alpha': ['float', [0.000001, 0.00001]],
+            'clf__penalty': ['cat', ['l2', 'elasticnet']]
+            # 'clf__n_iter': (10, 50, 80),
+        }
+
+    else:
+        print('Dataset not available for test')
+
+    # GP UCB search
+    all_gp_ucb_results = []
+    print('GP_ucb search')
+    for i in range(n_tests):
+        ucb_search = GPSearchCV(parameters, estimator=pipeline, X=X, y=y,
+                                acquisition_function='UCB',
+                                n_iter=n_iter_search, n_init=20,
+                                verbose=False)
+        _, scores = ucb_search._fit()
+
+        max_scores = [scores[0]]
+        print('Test %d - %d parameters tested' % (i, len(scores)))
+
+        for j in range(1, len(scores)):
+            max_scores.append(max(max_scores[j-1], scores[j]))
+        all_gp_ucb_results.append(extend_result(n_iter_search, max_scores))
+    all_gp_ucb_results = np.asarray(all_gp_ucb_results)
+    print(all_gp_ucb_results.shape)
+    if(save_data):
+        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')
+
+    # # GP EI search
+    # all_gp_ei_results = []
+    # print 'GP_ei search'
+    # for i in range(n_tests):
+    #     ei_search = GPSearchCV(parameters,estimator=pipeline,X=X,y=y,
+    #                            acquisition_function='EI',
+    #                            n_iter=n_iter_search, n_init=20, verbose=False)
+    #     _,scores = ei_search._fit()
+
+    #     max_scores = [scores[0]]
+    #     print 'Test',i,'-',len(scores),'parameters tested'
+
+    #     for j in range(1,len(scores)):
+    #         max_scores.append(max(max_scores[j-1],scores[j]))
+    #     all_gp_ei_results.append(extend_result(n_iter_search,max_scores))
+    # all_gp_ei_results = np.asarray(all_gp_ei_results)
+    # print all_gp_ei_results.shape
+    # if(save_data):
+    #     np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',')
+
+    # Randomized search
+    print('Random search')
+    all_random_results = []
+    for i in range(n_tests):
+        random_search = GPSearchCV(parameters, estimator=pipeline, X=X, y=y,
+                                   n_iter=n_iter_search,
+                                   n_init=n_iter_search, verbose=False)
+        _, scores = random_search._fit()
+
+        max_scores = [scores[0]]
+        print('Test %d - %d parameters tested' % (i, len(scores)))
+
+        for j in range(1, len(scores)):
+            max_scores.append(max(max_scores[j-1], scores[j]))
+        all_random_results.append(extend_result(n_iter_search, 
max_scores))
+    all_random_results = np.asarray(all_random_results)
+    if(save_data):
+        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')
+
+    plt.figure()
+    # plt.plot(range(n_iter_search), np.mean(all_gp_ei_results, axis=0),
+    #          'r', label='GP-EI')
+    plt.plot(range(n_iter_search), np.mean(all_gp_ucb_results, axis=0),
+             'b', label='GP-UCB')
+    plt.plot(range(n_iter_search), np.mean(all_random_results, axis=0),
+             'g', label='Random')
+    plt.legend(loc=4)
+    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on '
+              + str(n_tests) + ' trials')
+    plt.xlabel('Iterations')
+    plt.ylabel('Max CV performance')
+    plt.show()
 
 
 if __name__ == "__main__":
-
-    # print 'Routine Test'
-    # test2()
-    test_name = 'text'
-    n_tests = 20
-    search_lenght = 60
-    print '\nTest GP vs Random on',test_name,'dataset - Average on',n_tests,'trials'
-    gp_vs_random_search(test_name,n_tests,search_lenght,save_data=True)
\ No newline at end of file
+
+    print('Routine Test')
+    test2()
+    test1()
+
+    # test_name = 'text'
+    # n_tests = 20
+    # search_length = 60
+    # print('\nTest GP vs Random on %s dataset - Average on %d trials'
+    #       % (test_name, n_tests))
+    # gp_vs_random_search(test_name, n_tests, search_length, save_data=True)

From 7aeb5f7bd67c53cc8b0f3bc3d04e12aa69f043e9 Mon Sep 17 00:00:00 2001
From: Sebastien 
Date: Sat, 29 Aug 2015 22:27:59 +0200
Subject: [PATCH 9/9] Added documentation

---
 sklearn/gp_search.py | 140 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 135 insertions(+), 5 deletions(-)

diff --git a/sklearn/gp_search.py b/sklearn/gp_search.py
index d0529d779a07f..cb667d992d088 100644
--- a/sklearn/gp_search.py
+++ b/sklearn/gp_search.py
@@ -19,7 +19,6 @@
 # UTILS #
 
-
 def sample_candidates(n_candidates, param_bounds, param_isInt):
 
     n_parameters = param_isInt.shape[0]
@@ -50,6 +49,7 @@ def compute_ei(predictions, sigma, y_best):
     for i in range(ei_array.shape[0]):
         z = (y_best - predictions[i]) / sigma[i]
         ei_array[i] = sigma[i] * (z * norm.cdf(z) + norm.pdf(z))
+
     return ei_array
 
 
@@ -88,11 +88,110 @@ def is_in_ndarray(item, a):
 # GPSearchCV #
 
 class GPSearchCV(object):
     """
+    Parameters
+    ----------
+
+    parameters : dict, parameter space on which to optimize the estimator
+        The keys of the dictionary should be the names of the parameters,
+        and the values should be lists of length 2; the first element being
+        the type of the parameter ('int', 'float' or 'cat' [for categorical]),
+        and the second element being a list of either the bounds between which
+        to search (for 'int' and 'float') or the values the parameter can take
+        (for 'cat').
+        Example : parameters = {'kernel' : ['cat', ['rbf', 'poly']],
+                                'd' : ['int', [1, 3]],
+                                'C' : ['float', [1, 10]]}
+
+    estimator : 1) sklearn estimator or 2) callable
+        1 : object type that implements the "fit" and "predict" methods,
+        as a classifier or a pipeline
+        2 : a function that computes the output given a dictionary of
+        parameters. The returned value should be a list of one or more
+        floats if score_format == 'cv', and a float if score_format ==
+        'avg'
+
+    X : array-like, shape = [n_samples, n_features]
+        Training vector, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    fit_params : dict, optional
+        Parameters to pass to the fit method.
+
+    scoring : string, callable or None, optional
+        A string (see sklearn's model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+        Default is None.
+
+    cv : integer or cross-validation generator, optional
+        Relevant if the estimator is an sklearn object.
+        If an integer is passed, it is the number of folds.
+        Specific cross-validation objects can be passed, see the
+        sklearn.cross_validation module for the list of possible objects.
+        Default is 5.
+
+    acquisition_function : string, optional
+        Function to maximize in order to choose the next parameter to test.
+        - Simple : maximizes the predicted output
+        - UCB : maximizes the upper confidence bound
+        - EI : maximizes the expected improvement
+        Default is 'UCB'.
+
+    n_iter : int, optional
+        Total number of iterations to perform (including n_init and
+        n_final_iter).
+        Default is 100.
+
+    n_init : int, optional
+        Number of random iterations to perform before the smart search.
+        Default is 30.
+
+    n_final_iter : int, optional
+        Number of final iterations, i.e. smart iterations but with
+        acquisition_function == 'Simple'.
+        Default is 5.
+
+    n_candidates : int, optional
+        Number of random candidates to sample for each GP iteration.
+        Default is 500.
+
+    nugget : float, optional
+        The nugget to set for the Gaussian Process.
+        Default is 1.e-10.
+
+
+    Attributes
+    ----------
+
+    best_parameter_ : dict, the parameter set, among those tested by the
+        method _fit, that maximizes the mean of the cross-validation results.
+
+    tested_parameters_ : ndarray, the parameters tested by _fit.
+
+
     Examples
     --------
-    >>> parameters = {'kernel' : ['cat', ['rbf','poly']],
-    ...               'd' : ['int', [1,3]],
-    ...               'C' : ['float',[1,10])}
+    >>> from sklearn.datasets import load_digits
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> iris = load_digits()
+    >>> X, y = iris.data, iris.target
+    >>> clf = RandomForestClassifier(n_estimators=20)
+    >>> parameters = {"max_depth": ['int', [3, 3]],
+    ...               "max_features": ['int', [1, 11]],
+    ...               "min_samples_split": ['int', [1, 11]],
+    ...               "min_samples_leaf": ['int', [1, 11]],
+    ...               "bootstrap": ['cat', [True, False]],
+    ...               "criterion": ['cat', ["gini", "entropy"]]}
+
+    >>> search = GPSearchCV(parameters,
+    ...                     estimator=clf,
+    ...                     X=X,
+    ...                     y=y,
+    ...                     n_iter=20)
+    >>> search._fit()
 
     """
 
@@ -130,6 +229,10 @@ def __init__(self,
         self.X = X
         self.y = y
 
+        self.best_parameter_ = None
+        self.tested_parameters_ = None
+        self.cv_scores_ = None
+
         if(callable(estimator)):
             self._callable_estimator = True
             if(verbose):
@@ -173,6 +276,19 @@ def vector_to_dict(self, vector_parameter):
         return dict_parameter
 
     def score(self, test_parameter):
+        """
+        The score function to call in order to evaluate the quality
+        of the parameter set test_parameter.
+
+        Parameters
+        ----------
+        test_parameter : dict, the parameter set to test
+
+        Returns
+        -------
+        score : float, the mean of the CV scores
+        """
+
         if not self._callable_estimator:
             cv = check_cv(self.cv, self.X, self.y,
                           classifier=is_classifier(self.estimator))
@@ -196,6 +312,15 @@ def score(self, test_parameter):
         return score
 
     def _fit(self):
+        """
+        Run the hyper-parameter optimization process.
+
+        Returns
+        -------
+        tested_parameters_ : ndarray, the parameters tested during the process
+
+        cv_scores_ : array of the mean CV results of the parameters tested
+        """
 
         n_tested_parameters = 0
         tested_parameters = np.zeros((self.n_iter, self.n_parameters))
@@ -280,6 +405,11 @@ def _fit(self):
         vector_best_param = tested_parameters[best_idx]
         best_parameter = self.vector_to_dict(vector_best_param)
 
+        # store the search results as public attributes
+        self.best_parameter_ = best_parameter
+        
self.tested_parameters_ = tested_parameters[:n_tested_parameters, :] + self.cv_scores_ = cv_scores[:n_tested_parameters] + if(self.verbose): print ('\nTested ' + str(n_tested_parameters) + ' parameters') print ('Max cv score ' + str(cv_scores[best_idx]))
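
Note on the acquisition step: the docstring above names three acquisition
functions ('Simple', 'UCB' and 'EI'), but only the EI helper (compute_ei) is
visible in the hunks shown here. The sketch below is illustrative only, not
the patch's implementation: it shows how the three rules would rank a pool of
sampled candidates. It assumes the legacy sklearn GaussianProcess API, whose
predict(X, eval_MSE=True) returns the predictive mean and variance, and a
hypothetical exploration weight `kappa` for the UCB rule; the EI branch
mirrors the compute_ei helper above.

    import numpy as np
    from scipy.stats import norm


    def acquisition(gp, candidates, y_best, kind='UCB', kappa=1.96):
        # predictive mean and variance for each candidate row;
        # eval_MSE=True is the legacy GaussianProcess way to get the variance
        mu, mse = gp.predict(candidates, eval_MSE=True)
        sigma = np.sqrt(mse)

        if kind == 'Simple':
            # greedy: rank candidates by predicted CV score only
            return mu
        elif kind == 'UCB':
            # optimism in the face of uncertainty: bonus for uncertain regions
            return mu + kappa * sigma
        elif kind == 'EI':
            # expected improvement, mirroring the compute_ei helper above
            z = (y_best - mu) / sigma
            return sigma * (z * norm.cdf(z) + norm.pdf(z))
        raise ValueError('Unknown acquisition function: %s' % kind)

    # the next parameter vector to evaluate would then be, e.g.:
    # best = candidates[np.argmax(acquisition(gp, candidates, y_best))]

Sampling a finite pool with sample_candidates and scoring it this way
side-steps a continuous optimization of the acquisition function, at the cost
of only ever considering n_candidates random points per iteration.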