From c1c26cec05d2c5aeac21f76b14e4a71ca621dab4 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Tue, 22 Nov 2016 09:49:22 +1100
Subject: [PATCH 1/2] DOC insert spaces before colons in parameter lists

Complies with numpydoc to improve rendering and to enable automatic quality
assurance such as #7793. Affects listings of Parameters, Attributes and
Returns.

Performed with the help of:

grep -nE '^( )+[a-zA-Z][a-zA-Z0-9_]*: ' sklearn -R | grep -v -e externals -e tests | grep -v -e default: -e else: -e Warning: -e Note: -e TRAIN: -e Default: -e True: -e False: -e DOI: -e In: | gsed 's|\([^:]*\):\([0-9]*\):\([^:]*\):\(.*\)|--- a/\1\n+++ b/\1\n@@ -\2,1 +\2,1 @@\n-\3:\4\n+\3 :\4|' | git apply --unidiff-zero -
---
 sklearn/base.py | 10 +--
 sklearn/calibration.py | 2 +-
 sklearn/cluster/_k_means.pyx | 30 ++++----
 sklearn/cluster/affinity_propagation_.py | 2 +-
 sklearn/cluster/birch.py | 4 +-
 sklearn/cluster/hierarchical.py | 2 +-
 sklearn/cluster/k_means_.py | 56 +++++++--------
 sklearn/cluster/spectral.py | 2 +-
 sklearn/covariance/graph_lasso_.py | 4 +-
 sklearn/covariance/shrunk_covariance_.py | 6 +-
 sklearn/cross_decomposition/pls_.py | 6 +-
 sklearn/cross_validation.py | 2 +-
 sklearn/datasets/base.py | 8 +--
 sklearn/datasets/california_housing.py | 2 +-
 sklearn/datasets/rcv1.py | 2 +-
 sklearn/datasets/samples_generator.py | 4 +-
 sklearn/decomposition/dict_learning.py | 72 +++++++++----------
 sklearn/decomposition/factor_analysis.py | 8 +--
 sklearn/decomposition/fastica_.py | 2 +-
 sklearn/decomposition/incremental_pca.py | 10 +--
 sklearn/decomposition/kernel_pca.py | 14 ++--
 sklearn/decomposition/nmf.py | 14 ++--
 sklearn/decomposition/pca.py | 24 +++----
 sklearn/decomposition/sparse_pca.py | 6 +-
 sklearn/ensemble/gradient_boosting.py | 2 +-
 sklearn/feature_extraction/image.py | 8 +--
 sklearn/gaussian_process/gaussian_process.py | 8 +--
 sklearn/gaussian_process/gpc.py | 14 ++--
 sklearn/gaussian_process/gpr.py | 2 +-
 sklearn/gaussian_process/kernels.py | 8 +--
 sklearn/linear_model/base.py | 4 +-
 sklearn/linear_model/huber.py | 6 +-
 sklearn/linear_model/least_angle.py | 2 +-
 sklearn/linear_model/omp.py | 2 +-
 sklearn/linear_model/randomized_l1.py | 2 +-
 sklearn/linear_model/sag.py | 8 +--
 sklearn/linear_model/theil_sen.py | 2 +-
 sklearn/manifold/isomap.py | 8 +--
 sklearn/manifold/locally_linear.py | 8 +--
 sklearn/manifold/spectral_embedding_.py | 4 +-
 sklearn/manifold/t_sne.py | 2 +-
 sklearn/metrics/classification.py | 2 +-
 sklearn/metrics/cluster/supervised.py | 2 +-
 sklearn/metrics/cluster/unsupervised.py | 2 +-
 sklearn/mixture/base.py | 2 +-
 sklearn/mixture/bayesian_mixture.py | 2 +-
 sklearn/mixture/dpgmm.py | 12 ++--
 sklearn/mixture/gaussian_mixture.py | 8 +--
 sklearn/mixture/gmm.py | 8 +--
 sklearn/neighbors/base.py | 6 +-
 sklearn/neighbors/graph.py | 8 +--
 sklearn/neighbors/unsupervised.py | 2 +-
 .../neural_network/multilayer_perceptron.py | 6 +-
 sklearn/pipeline.py | 10 +--
 sklearn/preprocessing/data.py | 6 +-
 sklearn/random_projection.py | 2 +-
 sklearn/svm/base.py | 2 +-
 sklearn/svm/bounds.py | 2 +-
 sklearn/svm/classes.py | 2 +-
 sklearn/svm/libsvm.pyx | 8 +--
 sklearn/svm/libsvm_sparse.pyx | 4 +-
 sklearn/tree/_criterion.pyx | 64 ++++++++---------
 sklearn/tree/_splitter.pyx | 22 +++---
 sklearn/utils/__init__.py | 2 +-
 sklearn/utils/deprecation.py | 2 +-
 sklearn/utils/extmath.py | 40 +++++------
 sklearn/utils/graph.py | 2 +-
 sklearn/utils/graph_shortest_path.pyx | 4 +-
 sklearn/utils/murmurhash.pyx | 6 +-
 sklearn/utils/optimize.py | 6 +-
 sklearn/utils/sparsefuncs.py | 52 +++++++-------
sklearn/utils/sparsefuncs_fast.pyx | 24 +++---- sklearn/utils/sparsetools/_traversal.pyx | 4 +- 73 files changed, 353 insertions(+), 353 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 67a7c61c60e58..162db281d8cf7 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -43,10 +43,10 @@ def clone(estimator, safe=True): Parameters ---------- - estimator: estimator object, or list, tuple or set of objects + estimator : estimator object, or list, tuple or set of objects The estimator or group of estimators to be cloned - safe: boolean, optional + safe : boolean, optional If safe is false, clone will fall back to a deepcopy on objects that are not estimators. @@ -134,10 +134,10 @@ def _pprint(params, offset=0, printer=repr): Parameters ---------- - params: dict + params : dict The dictionary to pretty print - offset: int + offset : int The offset in characters to add at the begin of each line. printer: @@ -510,7 +510,7 @@ def score(self, X, y=None): Returns ------- - score: float + score : float """ pass diff --git a/sklearn/calibration.py b/sklearn/calibration.py index b96799f73d13d..37928817fd5e9 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -80,7 +80,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin): classes_ : array, shape (n_classes) The class labels. - calibrated_classifiers_: list (len() equal to cv or 1 if cv == "prefit") + calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit") The list of calibrated classifiers, one for each crossvalidation fold, which has been fitted on all but the validation fold and calibrated on the validation fold. diff --git a/sklearn/cluster/_k_means.pyx b/sklearn/cluster/_k_means.pyx index cc830d8152dad..3e91396f5d7bf 100644 --- a/sklearn/cluster/_k_means.pyx +++ b/sklearn/cluster/_k_means.pyx @@ -180,24 +180,24 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms, Parameters ---------- - X: CSR matrix, dtype float + X : CSR matrix, dtype float The complete (pre allocated) training set as a CSR matrix. - centers: array, shape (n_clusters, n_features) + centers : array, shape (n_clusters, n_features) The cluster centers - counts: array, shape (n_clusters,) + counts : array, shape (n_clusters,) The vector in which we keep track of the numbers of elements in a cluster Returns ------- - inertia: float + inertia : float The inertia of the batch prior to centers update, i.e. the sum distances to the closest center for each sample. This is the objective function being minimized by the k-means algorithm. - squared_diff: float + squared_diff : float The sum of squared update (squared norm of the centers position change). If compute_squared_diff is 0, this computation is skipped and 0.0 is returned instead. @@ -281,20 +281,20 @@ def _centers_dense(np.ndarray[floating, ndim=2] X, Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) - labels: array of integers, shape (n_samples) + labels : array of integers, shape (n_samples) Current label assignment - n_clusters: int + n_clusters : int Number of desired clusters - distances: array-like, shape (n_samples) + distances : array-like, shape (n_samples) Distance to closest cluster for each sample. 
Returns ------- - centers: array, shape (n_clusters, n_features) + centers : array, shape (n_clusters, n_features) The resulting centers """ ## TODO: add support for CSR input @@ -342,20 +342,20 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters, Parameters ---------- - X: scipy.sparse.csr_matrix, shape (n_samples, n_features) + X : scipy.sparse.csr_matrix, shape (n_samples, n_features) - labels: array of integers, shape (n_samples) + labels : array of integers, shape (n_samples) Current label assignment - n_clusters: int + n_clusters : int Number of desired clusters - distances: array-like, shape (n_samples) + distances : array-like, shape (n_samples) Distance to closest cluster for each sample. Returns ------- - centers: array, shape (n_clusters, n_features) + centers : array, shape (n_clusters, n_features) The resulting centers """ cdef int n_features = X.shape[1] diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 758306aa39796..1c9903dc2efe1 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -278,7 +278,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) or (n_samples, n_samples) + X : array-like, shape (n_samples, n_features) or (n_samples, n_samples) Data matrix or, if affinity is ``precomputed``, matrix of similarities / affinities. """ diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 05b618ddb8259..6575ba41a81b0 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -481,7 +481,7 @@ def _get_leaves(self): Returns ------- - leaves: array-like + leaves : array-like List of the leaf nodes. """ leaf_ptr = self.dummy_leaf_.next_leaf_ @@ -538,7 +538,7 @@ def predict(self, X): Returns ------- - labels: ndarray, shape(n_samples) + labels : ndarray, shape(n_samples) Labelled data. """ X = check_array(X, accept_sparse='csr') diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 9c3e43e901bf5..9c65c6418d12d 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -116,7 +116,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): limited use, and the 'parents' output should rather be used. This option is valid only when specifying a connectivity matrix. - return_distance: bool (optional) + return_distance : bool (optional) If True, return the distance between the clusters. Returns diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 778f1494371cc..bd48a1c36224a 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -47,20 +47,20 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): Parameters ----------- - X: array or sparse matrix, shape (n_samples, n_features) + X : array or sparse matrix, shape (n_samples, n_features) The data to pick seeds for. To avoid memory copy, the input data should be double precision (dtype=np.float64). - n_clusters: integer + n_clusters : integer The number of seeds to choose - x_squared_norms: array, shape (n_samples,) + x_squared_norms : array, shape (n_samples,) Squared Euclidean norm of each data point. - random_state: numpy.RandomState + random_state : numpy.RandomState The generator used to initialize the centers. 
- n_local_trials: integer, optional + n_local_trials : integer, optional The number of seeding trials for each center (except the first), of which the one reducing inertia the most is greedily chosen. Set to None to make the number of trials depend logarithmically @@ -267,7 +267,7 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto', The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). - best_n_iter: int + best_n_iter : int Number of iterations corresponding to the best results. Returned only if `return_n_iter` is set to True. @@ -409,17 +409,17 @@ def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++', Parameters ---------- - X: array-like of floats, shape (n_samples, n_features) + X : array-like of floats, shape (n_samples, n_features) The observations to cluster. - n_clusters: int + n_clusters : int The number of clusters to form as well as the number of centroids to generate. - max_iter: int, optional, default 300 + max_iter : int, optional, default 300 Maximum number of iterations of the k-means algorithm to run. - init: {'k-means++', 'random', or ndarray, or a callable}, optional + init : {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, default to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean @@ -435,33 +435,33 @@ def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++', If a callable is passed, it should take arguments X, k and and a random state and return an initialization. - tol: float, optional + tol : float, optional The relative increment in the results before declaring convergence. - verbose: boolean, optional + verbose : boolean, optional Verbosity mode - x_squared_norms: array + x_squared_norms : array Precomputed x_squared_norms. precompute_distances : boolean, default: True Precompute distances (faster but takes more memory). - random_state: integer or numpy.RandomState, optional + random_state : integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. Returns ------- - centroid: float ndarray with shape (k, n_features) + centroid : float ndarray with shape (k, n_features) Centroids found at the last iteration of k-means. - label: integer ndarray with shape (n_samples,) + label : integer ndarray with shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. - inertia: float + inertia : float The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). @@ -577,26 +577,26 @@ def _labels_inertia(X, x_squared_norms, centers, Parameters ---------- - X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features) + X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features) The input samples to assign to the labels. - x_squared_norms: array, shape (n_samples,) + x_squared_norms : array, shape (n_samples,) Precomputed squared euclidean norm of each data point, to speed up computations. - centers: float array, shape (k, n_features) + centers : float array, shape (k, n_features) The cluster centers. precompute_distances : boolean, default: True Precompute distances (faster but takes more memory). 
- distances: float array, shape (n_samples,) + distances : float array, shape (n_samples,) Pre-allocated array to be filled in with each sample's distance to the closest center. Returns ------- - labels: int array of shape(n) + labels : int array of shape(n) The resulting assignment inertia : float @@ -628,20 +628,20 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, Parameters ---------- - X: array, shape (n_samples, n_features) + X : array, shape (n_samples, n_features) - k: int + k : int number of centroids - init: {'k-means++', 'random' or ndarray or callable} optional + init : {'k-means++', 'random' or ndarray or callable} optional Method for initialization - random_state: integer or numpy.RandomState, optional + random_state : integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. - x_squared_norms: array, shape (n_samples,), optional + x_squared_norms : array, shape (n_samples,), optional Squared euclidean norm of each data point. Pass it if you have it at hands already to avoid it being recomputed here. Default: None @@ -653,7 +653,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, Returns ------- - centers: array, shape(k, n_features) + centers : array, shape(k, n_features) """ random_state = check_random_state(random_state) n_samples = X.shape[0] diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index cc5574eb447c3..8b64ca9a6dd12 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -39,7 +39,7 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, Maximum number of iterations to attempt in rotation and partition matrix search if machine precision convergence is not reached - random_state: int seed, RandomState instance, or None (default) + random_state : int seed, RandomState instance, or None (default) A pseudo random number generator used for the initialization of the of the rotation matrix diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index e166cfe2072b7..bc9b935c69dc7 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -461,7 +461,7 @@ class GraphLassoCV(GraphLasso): grid to be used. See the notes in the class docstring for more details. - n_refinements: strictly positive integer + n_refinements : strictly positive integer The number of times the grid is refined. Not used if explicit values of alphas are passed. @@ -492,7 +492,7 @@ class GraphLassoCV(GraphLasso): max_iter : integer, optional Maximum number of iterations. - mode: {'cd', 'lars'} + mode : {'cd', 'lars'} The Lasso solver to use: coordinate descent or LARS. Use LARS for very sparse underlying graphs, where number of features is greater than number of samples. Elsewhere prefer cd which is more numerically diff --git a/sklearn/covariance/shrunk_covariance_.py b/sklearn/covariance/shrunk_covariance_.py index 9830d30b5b19a..33d6463a1349d 100644 --- a/sklearn/covariance/shrunk_covariance_.py +++ b/sklearn/covariance/shrunk_covariance_.py @@ -168,7 +168,7 @@ def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): Returns ------- - shrinkage: float + shrinkage : float Coefficient in the convex combination used for the computation of the shrunk estimate. @@ -496,7 +496,7 @@ class OAS(EmpiricalCovariance): store_precision : bool, default=True Specify if the estimated precision is stored. 
- assume_centered: bool, default=False + assume_centered : bool, default=False If True, data are not centered before computation. Useful when working with data whose mean is almost, but not exactly zero. @@ -545,7 +545,7 @@ def fit(self, X, y=None): Returns ------- - self: object + self : object Returns self. """ diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index f7b6fd10ac7ba..baf61a521edae 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -153,7 +153,7 @@ class _PLS(six.with_metaclass(ABCMeta), BaseEstimator, TransformerMixin, mode : "A" classical PLS and "B" CCA. See notes. - norm_y_weights: boolean, normalize Y weights to one? (default False) + norm_y_weights : boolean, normalize Y weights to one? (default False) algorithm : string, "nipals" or "svd" The algorithm used to estimate the weights. It will be called @@ -195,7 +195,7 @@ class _PLS(six.with_metaclass(ABCMeta), BaseEstimator, TransformerMixin, y_rotations_ : array, [q, n_components] Y block to latents rotations. - coef_: array, [p, q] + coef_ : array, [p, q] The coefficients of the linear model: ``Y = X coef_ + Err`` n_iter_ : array-like @@ -517,7 +517,7 @@ class PLSRegression(_PLS): y_rotations_ : array, [q, n_components] Y block to latents rotations. - coef_: array, [p, q] + coef_ : array, [p, q] The coefficients of the linear model: ``Y = X coef_ + Err`` n_iter_ : array-like diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 65960aaa9efe0..a4a1e3d65c7ca 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1810,7 +1810,7 @@ def check_cv(cv, X=None, y=None, classifier=False): Returns ------- - checked_cv: a cross-validation generator instance. + checked_cv : a cross-validation generator instance. The return value is guaranteed to be a cv generator instance, whatever the input type. """ diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 924abc961548d..b83f9d4985e46 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -149,7 +149,7 @@ def load_files(container_path, description=None, categories=None, container_path : string or unicode Path to the main folder holding one subfolder per category - description: string or unicode, optional (default=None) + description : string or unicode, optional (default=None) A paragraph describing the characteristic of the dataset: its source, reference, etc. @@ -169,7 +169,7 @@ def load_files(container_path, description=None, categories=None, If not None, encoding to use to decode text files to Unicode if load_content is True. - decode_error: {'strict', 'ignore', 'replace'}, optional + decode_error : {'strict', 'ignore', 'replace'}, optional Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. Passed as keyword argument 'errors' to bytes.decode. 
@@ -703,12 +703,12 @@ def load_sample_image(image_name): Parameters ----------- - image_name: {`china.jpg`, `flower.jpg`} + image_name : {`china.jpg`, `flower.jpg`} The name of the sample image loaded Returns ------- - img: 3D array + img : 3D array The image as a numpy array: height x width x color Examples diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index 427fae7a6ec47..c109fee6185d8 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -59,7 +59,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True): Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. - download_if_missing: optional, True by default + download_if_missing : optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py index 4f25528ed0ff2..83b4d223cc361 100644 --- a/sklearn/datasets/rcv1.py +++ b/sklearn/datasets/rcv1.py @@ -59,7 +59,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. - subset: string, 'train', 'test', or 'all', default='all' + subset : string, 'train', 'test', or 'all', default='all' Select the dataset to load: 'train' for the training set (23149 samples), 'test' for the test set (781265 samples), 'all' for both, with the training samples first if shuffle is False. diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index acd07337541ca..8321159c35ed1 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -584,7 +584,7 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, n_samples : int, optional (default=100) The total number of points generated. - shuffle: bool, optional (default=True) + shuffle : bool, optional (default=True) Whether to shuffle the samples. noise : double or None (default=None) @@ -1050,7 +1050,7 @@ def make_sparse_coded_signal(n_samples, n_components, n_features, n_samples : int number of samples to generate - n_components: int, + n_components : int, number of components in the dictionary n_features : int diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 7e6a136f3d651..4a51deb52be0c 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -33,21 +33,21 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', Parameters ---------- - X: array of shape (n_samples, n_features) + X : array of shape (n_samples, n_features) Data matrix. - dictionary: array of shape (n_components, n_features) + dictionary : array of shape (n_components, n_features) The dictionary matrix against which to solve the sparse coding of the data. Some of the algorithms assume normalized rows. - gram: None | array, shape=(n_components, n_components) + gram : None | array, shape=(n_components, n_components) Precomputed Gram matrix, dictionary * dictionary' gram can be None if method is 'threshold'. 
- cov: array, shape=(n_components, n_samples) + cov : array, shape=(n_components, n_samples) Precomputed covariance, dictionary * X' - algorithm: {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'} + algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'} lars: uses the least angle regression method (linear_model.lars_path) lasso_lars: uses Lars to compute the Lasso solution lasso_cd: uses the coordinate descent method to compute the @@ -62,26 +62,26 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', algorithm is 'lasso_lars', 'lasso_cd' or 'threshold'. Otherwise it corresponds to n_nonzero_coefs. - init: array of shape (n_samples, n_components) + init : array of shape (n_samples, n_components) Initialization value of the sparse code. Only used if `algorithm='lasso_cd'`. - max_iter: int, 1000 by default + max_iter : int, 1000 by default Maximum number of iterations to perform if `algorithm='lasso_cd'`. - copy_cov: boolean, optional + copy_cov : boolean, optional Whether to copy the precomputed covariance matrix; if False, it may be overwritten. - check_input: boolean, optional + check_input : boolean, optional If False, the input arrays X and dictionary will not be checked. - verbose: int + verbose : int Controls the verbosity; the higher, the more messages. Defaults to 0. Returns ------- - code: array of shape (n_components, n_features) + code : array of shape (n_components, n_features) The sparse codes See also @@ -172,21 +172,21 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', Parameters ---------- - X: array of shape (n_samples, n_features) + X : array of shape (n_samples, n_features) Data matrix - dictionary: array of shape (n_components, n_features) + dictionary : array of shape (n_components, n_features) The dictionary matrix against which to solve the sparse coding of the data. Some of the algorithms assume normalized rows for meaningful output. - gram: array, shape=(n_components, n_components) + gram : array, shape=(n_components, n_components) Precomputed Gram matrix, dictionary * dictionary' - cov: array, shape=(n_components, n_samples) + cov : array, shape=(n_components, n_samples) Precomputed covariance, dictionary' * X - algorithm: {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'} + algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'} lars: uses the least angle regression method (linear_model.lars_path) lasso_lars: uses Lars to compute the Lasso solution lasso_cd: uses the coordinate descent method to compute the @@ -196,12 +196,12 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', threshold: squashes to zero all coefficients less than alpha from the projection dictionary * X' - n_nonzero_coefs: int, 0.1 * n_features by default + n_nonzero_coefs : int, 0.1 * n_features by default Number of nonzero coefficients to target in each column of the solution. This is only used by `algorithm='lars'` and `algorithm='omp'` and is overridden by `alpha` in the `omp` case. - alpha: float, 1. by default + alpha : float, 1. by default If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the penalty applied to the L1 norm. If `algorithm='threshold'`, `alpha` is the absolute value of the @@ -210,21 +210,21 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', the reconstruction error targeted. In this case, it overrides `n_nonzero_coefs`. 
- init: array of shape (n_samples, n_components) + init : array of shape (n_samples, n_components) Initialization value of the sparse codes. Only used if `algorithm='lasso_cd'`. - max_iter: int, 1000 by default + max_iter : int, 1000 by default Maximum number of iterations to perform if `algorithm='lasso_cd'`. - copy_cov: boolean, optional + copy_cov : boolean, optional Whether to copy the precomputed covariance matrix; if False, it may be overwritten. - n_jobs: int, optional + n_jobs : int, optional Number of parallel jobs to run. - check_input: boolean, optional + check_input : boolean, optional If False, the input arrays X and dictionary will not be checked. verbose : int, optional @@ -232,7 +232,7 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', Returns ------- - code: array of shape (n_samples, n_components) + code : array of shape (n_samples, n_components) The sparse codes See also @@ -309,28 +309,28 @@ def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, Parameters ---------- - dictionary: array of shape (n_features, n_components) + dictionary : array of shape (n_features, n_components) Value of the dictionary at the previous iteration. - Y: array of shape (n_features, n_samples) + Y : array of shape (n_features, n_samples) Data matrix. - code: array of shape (n_components, n_samples) + code : array of shape (n_components, n_samples) Sparse coding of the data against which to optimize the dictionary. verbose: Degree of output the procedure will print. - return_r2: bool + return_r2 : bool Whether to compute and return the residual sum of squares corresponding to the computed solution. - random_state: int or RandomState + random_state : int or RandomState Pseudo number generator state used for random sampling. Returns ------- - dictionary: array of shape (n_features, n_components) + dictionary : array of shape (n_features, n_components) Updated dictionary. """ @@ -569,7 +569,7 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, Parameters ---------- - X: array of shape (n_samples, n_features) + X : array of shape (n_samples, n_features) Data matrix. n_components : int, @@ -1048,13 +1048,13 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. Returns ------- - self: object + self : object Returns the object itself """ random_state = check_random_state(self.random_state) @@ -1215,7 +1215,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. @@ -1248,11 +1248,11 @@ def partial_fit(self, X, y=None, iter_offset=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. - iter_offset: integer, optional + iter_offset : integer, optional The number of iteration on data batches that has been performed before this call to partial_fit. 
This is optional: if no number is passed, the memory of the object is diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index 3ce3f21c0a64f..b0bcd0a87d8e2 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -309,12 +309,12 @@ def score_samples(self, X): Parameters ---------- - X: array, shape (n_samples, n_features) + X : array, shape (n_samples, n_features) The data Returns ------- - ll: array, shape (n_samples,) + ll : array, shape (n_samples,) Log-likelihood of each sample under the current model """ check_is_fitted(self, 'components_') @@ -333,12 +333,12 @@ def score(self, X, y=None): Parameters ---------- - X: array, shape (n_samples, n_features) + X : array, shape (n_samples, n_features) The data Returns ------- - ll: float + ll : float Average log-likelihood of the samples under the current model """ return np.mean(self.score_samples(X)) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 5c5e1bb755ce8..fbbbbec1b713d 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -191,7 +191,7 @@ def my_g(x): max_iter : int, optional Maximum number of iterations to perform. - tol: float, optional + tol : float, optional A positive scalar giving the tolerance at which the un-mixing matrix is considered to have converged. diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index 8f5288beaca32..e1806d1ef7616 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -153,15 +153,15 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - y: Passthrough for ``Pipeline`` compatibility. + y : Passthrough for ``Pipeline`` compatibility. Returns ------- - self: object + self : object Returns the instance itself. """ self.components_ = None @@ -192,13 +192,13 @@ def partial_fit(self, X, y=None, check_input=True): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. Returns ------- - self: object + self : object Returns the instance itself. """ if check_input: diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 49845e32685e3..1fb6b55f43aaa 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -223,7 +223,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. @@ -249,13 +249,13 @@ def fit_transform(self, X, y=None, **params): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. 
Returns ------- - X_new: array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_samples, n_components) """ self.fit(X, **params) @@ -271,11 +271,11 @@ def transform(self, X): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Returns ------- - X_new: array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_samples, n_components) """ check_is_fitted(self, 'X_fit_') @@ -287,11 +287,11 @@ def inverse_transform(self, X): Parameters ---------- - X: array-like, shape (n_samples, n_components) + X : array-like, shape (n_samples, n_components) Returns ------- - X_new: array-like, shape (n_samples, n_features) + X_new : array-like, shape (n_samples, n_features) References ---------- diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 29707ac94cf70..cf5fc431e6159 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1007,7 +1007,7 @@ def fit_transform(self, X, y=None, W=None, H=None): Parameters ---------- - X: {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed W : array-like, shape (n_samples, n_components) @@ -1018,7 +1018,7 @@ def fit_transform(self, X, y=None, W=None, H=None): Returns ------- - W: array, shape (n_samples, n_components) + W : array, shape (n_samples, n_components) Transformed data. """ X = check_array(X, accept_sparse=('csr', 'csc')) @@ -1050,7 +1050,7 @@ def fit(self, X, y=None, **params): Parameters ---------- - X: {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed Returns @@ -1065,12 +1065,12 @@ def transform(self, X): Parameters ---------- - X: {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be transformed by the model Returns ------- - W: array, shape (n_samples, n_components) + W : array, shape (n_samples, n_components) Transformed data """ check_is_fitted(self, 'n_components_') @@ -1092,12 +1092,12 @@ def inverse_transform(self, W): Parameters ---------- - W: {array-like, sparse matrix}, shape (n_samples, n_components) + W : {array-like, sparse matrix}, shape (n_samples, n_components) Transformed data matrix Returns ------- - X: {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix of original shape .. versionadded:: 0.18 diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index f9a4142ee8c19..42fa06406ec47 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -38,18 +38,18 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): Parameters ---------- - spectrum: array of shape (n) + spectrum : array of shape (n) Data spectrum. - rank: int + rank : int Tested rank value. - n_samples: int + n_samples : int Number of samples. - n_features: int + n_features : int Number of features. Returns ------- - ll: float, + ll : float, The log-likelihood Notes @@ -307,7 +307,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training data, where n_samples in the number of samples and n_features is the number of features. 
@@ -506,12 +506,12 @@ def score_samples(self, X): Parameters ---------- - X: array, shape(n_samples, n_features) + X : array, shape(n_samples, n_features) The data. Returns ------- - ll: array, shape (n_samples,) + ll : array, shape (n_samples,) Log-likelihood of each sample under the current model """ check_is_fitted(self, 'mean_') @@ -535,12 +535,12 @@ def score(self, X, y=None): Parameters ---------- - X: array, shape(n_samples, n_features) + X : array, shape(n_samples, n_features) The data. Returns ------- - ll: float + ll : float Average log-likelihood of the samples under the current model """ return np.mean(self.score_samples(X)) @@ -658,7 +658,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training data, where n_samples in the number of samples and n_features is the number of features. @@ -675,7 +675,7 @@ def _fit(self, X): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 392704eda80f2..038a2f82b521a 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -98,7 +98,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. @@ -142,7 +142,7 @@ def transform(self, X, ridge_alpha=None): Parameters ---------- - X: array of shape (n_samples, n_features) + X : array of shape (n_samples, n_features) Test data to be transformed, must have the same number of features as the data used to train the model. @@ -255,7 +255,7 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index fcfeb45a09157..2db5b574ade2a 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1537,7 +1537,7 @@ def predict(self, X): Returns ------- - y: array of shape = ["n_samples] + y : array of shape = ["n_samples] The predicted values. """ score = self.decision_function(X) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index f4bfd7e533894..694c624f11110 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -34,11 +34,11 @@ def _make_edges_3d(n_x, n_y, n_z=1): Parameters =========== - n_x: integer + n_x : integer The size of the grid in the x direction. - n_y: integer + n_y : integer The size of the grid in the y direction. 
- n_z: integer, optional + n_z : integer, optional The size of the grid in the z direction, defaults to 1 """ vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z)) @@ -480,7 +480,7 @@ def transform(self, X): Returns ------- - patches: array, shape = (n_patches, patch_height, patch_width) or + patches : array, shape = (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels) The collection of patches extracted from the images, where `n_patches` is either `n_samples * max_patches` or the total diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py index c521cb5b52f43..79b11f1b7c2cb 100644 --- a/sklearn/gaussian_process/gaussian_process.py +++ b/sklearn/gaussian_process/gaussian_process.py @@ -30,16 +30,16 @@ def l1_cross_distances(X): Parameters ---------- - X: array_like + X : array_like An array with shape (n_samples, n_features) Returns ------- - D: array with shape (n_samples * (n_samples - 1) / 2, n_features) + D : array with shape (n_samples * (n_samples - 1) / 2, n_features) The array of componentwise L1 cross-distances. - ij: arrays with shape (n_samples * (n_samples - 1) / 2, 2) + ij : arrays with shape (n_samples * (n_samples - 1) / 2, 2) The indices i and j of the vectors in X associated to the cross- distances in D: D[k] = np.abs(X[ij[k, 0]] - Y[ij[k, 1]]). """ @@ -169,7 +169,7 @@ class GaussianProcess(BaseEstimator, RegressorMixin): exponential distribution (log-uniform on [thetaL, thetaU]). Default does not use random starting point (random_start = 1). - random_state: integer or numpy.RandomState, optional + random_state : integer or numpy.RandomState, optional The generator used to shuffle the sequence of coordinates of theta in the Welch optimizer. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py index f0e6c6c439779..bbb1feda98e07 100644 --- a/sklearn/gaussian_process/gpc.py +++ b/sklearn/gaussian_process/gpc.py @@ -116,29 +116,29 @@ def optimizer(obj_func, initial_theta, bounds): X_train_ : array-like, shape = (n_samples, n_features) Feature values in training data (also required for prediction) - y_train_: array-like, shape = (n_samples,) + y_train_ : array-like, shape = (n_samples,) Target values in training data (also required for prediction) classes_ : array-like, shape = (n_classes,) Unique class labels. - kernel_: kernel object + kernel_ : kernel object The kernel used for prediction. The structure of the kernel is the same as the one passed as parameter but with optimized hyperparameters - L_: array-like, shape = (n_samples, n_samples) + L_ : array-like, shape = (n_samples, n_samples) Lower-triangular Cholesky decomposition of the kernel in X_train_ - pi_: array-like, shape = (n_samples,) + pi_ : array-like, shape = (n_samples,) The probabilities of the positive class for the training points X_train_ - W_sr_: array-like, shape = (n_samples,) + W_sr_ : array-like, shape = (n_samples,) Square root of W, the Hessian of log-likelihood of the latent function values for the observed labels. Since W is diagonal, only the diagonal of sqrt(W) is stored. - log_marginal_likelihood_value_: float + log_marginal_likelihood_value_ : float The log-marginal-likelihood of ``self.kernel_.theta`` """ @@ -515,7 +515,7 @@ def optimizer(obj_func, initial_theta, bounds): given, it fixes the seed. Defaults to the global numpy random number generator. 
- multi_class: string, default : "one_vs_rest" + multi_class : string, default : "one_vs_rest" Specifies how multi-class classification problems are handled. Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest", one binary Gaussian process classifier is fitted for each class, which diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index 4f4941fe1d706..ac1b1f6d6254a 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -323,7 +323,7 @@ def sample_y(self, X, n_samples=1, random_state=0): n_samples : int, default: 1 The number of samples drawn from the Gaussian process - random_state: RandomState or an int seed (0 by default) + random_state : RandomState or an int seed (0 by default) A random number generator instance Returns diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index c8466026e7a63..d1a1d6b344574 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -127,7 +127,7 @@ def get_params(self, deep=True): Parameters ---------- - deep: boolean, optional + deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -405,7 +405,7 @@ def get_params(self, deep=True): Parameters ---------- - deep: boolean, optional + deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -542,7 +542,7 @@ def get_params(self, deep=True): Parameters ---------- - deep: boolean, optional + deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -806,7 +806,7 @@ def get_params(self, deep=True): Parameters ---------- - deep: boolean, optional + deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 004aeac140b15..1dbb1a4cc77d1 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -376,7 +376,7 @@ def densify(self): Returns ------- - self: estimator + self : estimator """ msg = "Estimator, %(name)s, must be fitted before densifying." check_is_fitted(self, "coef_", msg=msg) @@ -406,7 +406,7 @@ def sparsify(self): Returns ------- - self: estimator + self : estimator """ msg = "Estimator, %(name)s, must be fitted before sparsifying." check_is_fitted(self, "coef_", msg=msg) diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index 66b8478ab72e7..e17dc1e61662d 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -41,10 +41,10 @@ def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None): Returns ------- - loss: float + loss : float Huber loss. - gradient: ndarray, shape (len(w)) + gradient : ndarray, shape (len(w)) Returns the derivative of the Huber loss with respect to each coefficient, intercept and the scale as a vector. """ @@ -183,7 +183,7 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): Number of iterations that fmin_l_bfgs_b has run for. Not available if SciPy version is 0.9 and below. - outliers_: array, shape (n_samples,) + outliers_ : array, shape (n_samples,) A boolean mask which is set to True where the samples are identified as outliers. 
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 0c64f1e2e2e88..4384cb56535fe 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -1007,7 +1007,7 @@ class LarsCV(Lars): calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. - max_iter: integer, optional + max_iter : integer, optional Maximum number of iterations to perform. cv : int, cross-validation generator or an iterable, optional diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 41acb4cd4780d..d39f5a26389be 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -711,7 +711,7 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True, Returns ------- - residues: array, shape (n_samples, max_features) + residues : array, shape (n_samples, max_features) Residues of the prediction on the test data """ diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py index fa2778cd43eae..537c4c6969872 100644 --- a/sklearn/linear_model/randomized_l1.py +++ b/sklearn/linear_model/randomized_l1.py @@ -216,7 +216,7 @@ class RandomizedLasso(BaseRandomizedLinearModel): n_resampling : int, optional Number of randomized models. - selection_threshold: float, optional + selection_threshold : float, optional The score above which features should be selected. fit_intercept : boolean, optional diff --git a/sklearn/linear_model/sag.py b/sklearn/linear_model/sag.py index 83ef5bdfdaadf..d09d7ecaaa55b 100644 --- a/sklearn/linear_model/sag.py +++ b/sklearn/linear_model/sag.py @@ -108,15 +108,15 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., alpha : float, optional Constant that multiplies the regularization term. Defaults to 1. - max_iter: int, optional + max_iter : int, optional The max number of passes over the training data if the stopping criteria is not reached. Defaults to 1000. - tol: double, optional + tol : double, optional The stopping criteria for the weights. The iterations will stop when max(change in weights) / max(weights) < tol. Defaults to .001 - verbose: integer, optional + verbose : integer, optional The verbosity level. random_state : int seed, RandomState instance, or None (default) @@ -131,7 +131,7 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., going through all the samples. The value should be precomputed to speed up cross validation. - warm_start_mem: dict, optional + warm_start_mem : dict, optional The initialization parameters used for warm starting. Warm starting is currently used in LogisticRegression but not in Ridge. It contains: diff --git a/sklearn/linear_model/theil_sen.py b/sklearn/linear_model/theil_sen.py index 0764304559ddd..23b3c106c9bd7 100644 --- a/sklearn/linear_model/theil_sen.py +++ b/sklearn/linear_model/theil_sen.py @@ -102,7 +102,7 @@ def _spatial_median(X, max_iter=300, tol=1.e-3): spatial_median : array, shape = [n_features] Spatial median. - n_iter: int + n_iter : int Number of iterations needed. References diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index 5553d6e978575..1f6d0ae0dc0b1 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -169,13 +169,13 @@ def fit_transform(self, X, y=None): Parameters ---------- - X: {array-like, sparse matrix, BallTree, KDTree} + X : {array-like, sparse matrix, BallTree, KDTree} Training vector, where n_samples in the number of samples and n_features is the number of features. 
Returns ------- - X_new: array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_samples, n_components) """ self._fit_transform(X) return self.embedding_ @@ -193,11 +193,11 @@ def transform(self, X): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Returns ------- - X_new: array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_samples, n_components) """ X = check_array(X) distances, indices = self.nbrs_.kneighbors(X, return_distance=True) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index f5a383d58a350..367710edc667e 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -28,7 +28,7 @@ def barycenter_weights(X, Z, reg=1e-3): Z : array-like, shape (n_samples, n_neighbors, n_dim) - reg: float, optional + reg : float, optional amount of regularization to add for the problem to be well-posed in the case of n_neighbors > n_dim @@ -245,7 +245,7 @@ def locally_linear_embedding( Tolerance for modified LLE method. Only used if method == 'modified' - random_state: numpy.RandomState or int, optional + random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. Defaults to numpy.random. @@ -568,7 +568,7 @@ class LocallyLinearEmbedding(BaseEstimator, TransformerMixin): algorithm to use for nearest neighbors search, passed to neighbors.NearestNeighbors instance - random_state: numpy.RandomState or int, optional + random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. Defaults to numpy.random. @@ -662,7 +662,7 @@ def fit_transform(self, X, y=None): Returns ------- - X_new: array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_samples, n_components) """ self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index d8a69c402122e..c2fc878693c93 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -494,7 +494,7 @@ def fit_transform(self, X, y=None): Parameters ---------- - X: array-like, shape (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. @@ -505,7 +505,7 @@ def fit_transform(self, X, y=None): Returns ------- - X_new: array-like, shape (n_samples, n_components) + X_new : array-like, shape (n_samples, n_components) """ self.fit(X) return self.embedding_ diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 6d74cf598392f..d7d912d827b3f 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -237,7 +237,7 @@ def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples, P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. - neighbors: int64 array, shape (n_samples, K) + neighbors : int64 array, shape (n_samples, K) Array with element [i, j] giving the index for the jth closest neighbor to point i. 
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index ee07fa634d080..d04a902369830 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -714,7 +714,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. - beta: float + beta : float Weight of precision in harmonic mean. labels : list, optional diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index a5de7d2e6c751..6ec19205043c3 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -389,7 +389,7 @@ def completeness_score(labels_true, labels_pred): Returns ------- - completeness: float + completeness : float score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling References diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index d8e4d0f470deb..1aed8e72a654b 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -226,7 +226,7 @@ def calinski_harabaz_score(X, labels): Returns ------- - score: float + score : float The resulting Calinski-Harabaz score. References diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py index ad78183bb16d1..ca48ea0c5c17e 100644 --- a/sklearn/mixture/base.py +++ b/sklearn/mixture/base.py @@ -137,7 +137,7 @@ def _initialize_parameters(self, X, random_state): ---------- X : array-like, shape (n_samples, n_features) - random_state: RandomState + random_state : RandomState A random number generator instance. """ n_samples, _ = X.shape diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py index 7112cfc1786e1..497b339a4f807 100644 --- a/sklearn/mixture/bayesian_mixture.py +++ b/sklearn/mixture/bayesian_mixture.py @@ -163,7 +163,7 @@ class BayesianGaussianMixture(BaseMixture): (n_features) if 'diag', float if 'spherical' - random_state: RandomState or an int seed, defaults to None. + random_state : RandomState or an int seed, defaults to None. A random number generator instance. warm_start : bool, default to False. diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py index bbbf0b9e217f2..b5737ddccf85b 100644 --- a/sklearn/mixture/dpgmm.py +++ b/sklearn/mixture/dpgmm.py @@ -146,14 +146,14 @@ class _DPGMMBase(_GMMBase): Parameters ---------- - n_components: int, default 1 + n_components : int, default 1 Number of mixture components. - covariance_type: string, default 'diag' + covariance_type : string, default 'diag' String describing the type of covariance parameters to use. Must be one of 'spherical', 'tied', 'diag', 'full'. - alpha: float, default 1 + alpha : float, default 1 Real number representing the concentration parameter of the dirichlet process. Intuitively, the Dirichlet Process is as likely to start a new cluster for a point as it is @@ -674,14 +674,14 @@ class VBGMM(_DPGMMBase): Parameters ---------- - n_components: int, default 1 + n_components : int, default 1 Number of mixture components. - covariance_type: string, default 'diag' + covariance_type : string, default 'diag' String describing the type of covariance parameters to use. Must be one of 'spherical', 'tied', 'diag', 'full'. - alpha: float, default 1 + alpha : float, default 1 Real number representing the concentration parameter of the dirichlet distribution. 
Intuitively, the higher the value of alpha the more likely the variational mixture of diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py index 4d68df33932fc..0065b82e6d5d2 100644 --- a/sklearn/mixture/gaussian_mixture.py +++ b/sklearn/mixture/gaussian_mixture.py @@ -485,11 +485,11 @@ class GaussianMixture(BaseMixture): The user-provided initial weights, defaults to None. If it None, weights are initialized using the `init_params` method. - means_init: array-like, shape (n_components, n_features), optional + means_init : array-like, shape (n_components, n_features), optional The user-provided initial means, defaults to None, If it None, means are initialized using the `init_params` method. - precisions_init: array-like, optional. + precisions_init : array-like, optional. The user-provided initial precisions (inverse of the covariance matrices), defaults to None. If it None, precisions are initialized using the 'init_params' method. @@ -726,7 +726,7 @@ def bic(self, X): Returns ------- - bic: float + bic : float The lower the better. """ return (-2 * self.score(X) * X.shape[0] + @@ -741,7 +741,7 @@ def aic(self, X): Returns ------- - aic: float + aic : float The lower the better. """ return -2 * self.score(X) * X.shape[0] + 2 * self._n_parameters() diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py index 69f182b142590..872fd610fb6af 100644 --- a/sklearn/mixture/gmm.py +++ b/sklearn/mixture/gmm.py @@ -152,7 +152,7 @@ class _GMMBase(BaseEstimator): use. Must be one of 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'. - random_state: RandomState or an int seed (None by default) + random_state : RandomState or an int seed (None by default) A random number generator instance min_covar : float, optional @@ -316,7 +316,7 @@ def score_samples(self, X): Parameters ---------- - X: array_like, shape (n_samples, n_features) + X : array_like, shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. @@ -647,7 +647,7 @@ def bic(self, X): Returns ------- - bic: float (the lower the better) + bic : float (the lower the better) """ return (-2 * self.score(X).sum() + self._n_parameters() * np.log(X.shape[0])) @@ -662,7 +662,7 @@ def aic(self, X): Returns ------- - aic: float (the lower the better) + aic : float (the lower the better) """ return - 2 * self.score(X).sum() + 2 * self._n_parameters() diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index a2339aeb2d388..0cf8bc04ae230 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -60,14 +60,14 @@ def _get_weights(dist, weights): Parameters =========== - dist: ndarray + dist : ndarray The input distances - weights: {'uniform', 'distance' or a callable} + weights : {'uniform', 'distance' or a callable} The kind of weighting used Returns ======== - weights_arr: array of the same shape as ``dist`` + weights_arr : array of the same shape as ``dist`` if ``weights == 'uniform'``, then returns None """ if weights in (None, 'uniform'): diff --git a/sklearn/neighbors/graph.py b/sklearn/neighbors/graph.py index 84f8986396fc8..f04596584f2bf 100644 --- a/sklearn/neighbors/graph.py +++ b/sklearn/neighbors/graph.py @@ -57,7 +57,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', The default distance is 'euclidean' ('minkowski' metric with the p param equal to 2.) - include_self: bool, default=False. + include_self : bool, default=False. 
Whether or not to mark each sample as the first nearest neighbor to itself. If `None`, then True is used for mode='connectivity' and False for mode='distance' as this will preserve backwards compatibilty. @@ -67,7 +67,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params: dict, optional + metric_params : dict, optional additional keyword arguments for the metric function. n_jobs : int, optional (default = 1) @@ -132,7 +132,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', gives a list of available metrics. The default distance is 'euclidean' ('minkowski' metric with the param equal to 2.) - include_self: bool, default=False + include_self : bool, default=False Whether or not to mark each sample as the first nearest neighbor to itself. If `None`, then True is used for mode='connectivity' and False for mode='distance' as this will preserve backwards compatibilty. @@ -142,7 +142,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params: dict, optional + metric_params : dict, optional additional keyword arguments for the metric function. n_jobs : int, optional (default = 1) diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index 04d24d23b8df0..770f8f64c0270 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -39,7 +39,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, required to store the tree. The optimal value depends on the nature of the problem. - p: integer, optional (default = 2) + p : integer, optional (default = 2) Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index 2b81446d8f871..af324e84f1c39 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -89,7 +89,7 @@ def _forward_pass(self, activations): Parameters ---------- - activations: list, length = n_layers - 1 + activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. with_output_activation : bool, default True @@ -149,7 +149,7 @@ def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, y : array-like, shape (n_samples,) The target values. - activations: list, length = n_layers - 1 + activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 @@ -193,7 +193,7 @@ def _backprop(self, X, y, activations, deltas, coef_grads, y : array-like, shape (n_samples,) The target values. - activations: list, length = n_layers - 1 + activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. 
deltas : list, length = n_layers - 1 diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 40efb63bf7e25..d8fa137d703dd 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -157,7 +157,7 @@ def get_params(self, deep=True): Parameters ---------- - deep: boolean, optional + deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -606,14 +606,14 @@ class FeatureUnion(_BasePipeline, TransformerMixin): Parameters ---------- - transformer_list: list of (string, transformer) tuples + transformer_list : list of (string, transformer) tuples List of transformer objects to be applied to the data. The first half of each tuple is the name of the transformer. - n_jobs: int, optional + n_jobs : int, optional Number of jobs to run in parallel (default 1). - transformer_weights: dict, optional + transformer_weights : dict, optional Multiplicative weights for features per transformer. Keys are transformer names, values the weights. @@ -629,7 +629,7 @@ def get_params(self, deep=True): Parameters ---------- - deep: boolean, optional + deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 4740d18f5b84e..fe526d9bccaf4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -552,7 +552,7 @@ def fit(self, X, y=None): The data used to compute the mean and standard deviation used for later scaling along the features axis. - y: Passthrough for ``Pipeline`` compatibility. + y : Passthrough for ``Pipeline`` compatibility. """ # Reset internal state before fitting @@ -576,7 +576,7 @@ def partial_fit(self, X, y=None): The data used to compute the mean and standard deviation used for later scaling along the features axis. - y: Passthrough for ``Pipeline`` compatibility. + y : Passthrough for ``Pipeline`` compatibility. """ X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, ensure_2d=False, warn_on_dtype=True, @@ -776,7 +776,7 @@ def partial_fit(self, X, y=None): The data used to compute the mean and standard deviation used for later scaling along the features axis. - y: Passthrough for ``Pipeline`` compatibility. + y : Passthrough for ``Pipeline`` compatibility. """ X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index b06558f7d5f42..d513d41e9bd67 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -233,7 +233,7 @@ def sparse_random_matrix(n_components, n_features, density='auto', Returns ------- - components: numpy array or CSR matrix with shape [n_components, n_features] + components : array or CSR matrix with shape [n_components, n_features] The generated Gaussian random matrix. See Also diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index c35efc7c86899..d24bfefce4f6d 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -851,7 +851,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, that the value of this parameter depends on the scale of the target variable y. If unsure, set epsilon=0. - sample_weight: array-like, optional + sample_weight : array-like, optional Weights assigned to each sample. 
Returns diff --git a/sklearn/svm/bounds.py b/sklearn/svm/bounds.py index 834661fc5be80..808b3872c6762 100644 --- a/sklearn/svm/bounds.py +++ b/sklearn/svm/bounds.py @@ -50,7 +50,7 @@ def l1_min_c(X, y, loss='squared_hinge', fit_intercept=True, Returns ------- - l1_min_c: float + l1_min_c : float minimum value for C """ if loss not in ('squared_hinge', 'log'): diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index e10fcbca1a043..1d269a02c9087 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -47,7 +47,7 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, tol : float, optional (default=1e-4) Tolerance for stopping criteria. - multi_class: string, 'ovr' or 'crammer_singer' (default='ovr') + multi_class : string, 'ovr' or 'crammer_singer' (default='ovr') Determines the multi-class strategy if `y` contains more than two classes. ``"ovr"`` trains n_classes one-vs-rest classifiers, while ``"crammer_singer"`` diff --git a/sklearn/svm/libsvm.pyx b/sklearn/svm/libsvm.pyx index 589cf1e8ac150..8607e74a7e92e 100644 --- a/sklearn/svm/libsvm.pyx +++ b/sklearn/svm/libsvm.pyx @@ -287,7 +287,7 @@ def predict(np.ndarray[np.float64_t, ndim=2, mode='c'] X, Parameters ---------- - X: array-like, dtype=float, size=[n_samples, n_features] + X : array-like, dtype=float, size=[n_samples, n_features] svm_type : {0, 1, 2, 3, 4} Type of SVM: C SVC, nu SVC, one class, epsilon SVR, nu SVR kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'} @@ -363,7 +363,7 @@ def predict_proba( Parameters ---------- - X: array-like, dtype=float + X : array-like, dtype=float kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'} Returns @@ -477,9 +477,9 @@ def cross_validation( Parameters ---------- - X: array-like, dtype=float, size=[n_samples, n_features] + X : array-like, dtype=float, size=[n_samples, n_features] - Y: array, dtype=float, size=[n_samples] + Y : array, dtype=float, size=[n_samples] target vector svm_type : {0, 1, 2, 3, 4} diff --git a/sklearn/svm/libsvm_sparse.pyx b/sklearn/svm/libsvm_sparse.pyx index b06d1207bb6b1..66bddd63848fa 100644 --- a/sklearn/svm/libsvm_sparse.pyx +++ b/sklearn/svm/libsvm_sparse.pyx @@ -89,9 +89,9 @@ def libsvm_sparse_train ( int n_features, n_features : number of features. XXX: can we retrieve this from any other parameter ? - X: array-like, dtype=float, size=[N, D] + X : array-like, dtype=float, size=[N, D] - Y: array, dtype=float, size=[N] + Y : array, dtype=float, size=[N] target vector ... 
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e7ad82f6dcd49..521f545bdea26 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -58,21 +58,21 @@ cdef class Criterion: Parameters ---------- - y: array-like, dtype=DOUBLE_t + y : array-like, dtype=DOUBLE_t y is a buffer that can store values for n_outputs target variables - y_stride: SIZE_t + y_stride : SIZE_t y_stride is used to index the kth output value as follows: y[i, k] = y[i * y_stride + k] - sample_weight: array-like, dtype=DOUBLE_t + sample_weight : array-like, dtype=DOUBLE_t The weight of each sample - weighted_n_samples: DOUBLE_t + weighted_n_samples : DOUBLE_t The total weight of the samples being considered - samples: array-like, dtype=DOUBLE_t + samples : array-like, dtype=DOUBLE_t Indices of the samples in X and y, where samples[start:end] correspond to the samples in this node - start: SIZE_t + start : SIZE_t The first sample to be used on this node - end: SIZE_t + end : SIZE_t The last sample used on this node """ @@ -103,7 +103,7 @@ cdef class Criterion: Parameters ---------- - new_pos: SIZE_t + new_pos : SIZE_t New starting index position of the samples in the right child """ @@ -129,10 +129,10 @@ cdef class Criterion: Parameters ---------- - impurity_left: double pointer + impurity_left : double pointer The memory address where the impurity of the left child should be stored. - impurity_right: double pointer + impurity_right : double pointer The memory address where the impurity of the right child should be stored """ @@ -147,7 +147,7 @@ cdef class Criterion: Parameters ---------- - dest: double pointer + dest : double pointer The memory address where the node value should be stored. """ @@ -187,12 +187,12 @@ cdef class Criterion: Parameters ---------- - impurity: double + impurity : double The initial impurity of the node before the split Return ------ - double: improvement in impurity after the split occurs + double : improvement in impurity after the split occurs """ cdef double impurity_left @@ -219,9 +219,9 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - n_outputs: SIZE_t + n_outputs : SIZE_t The number of targets, the dimensionality of the prediction - n_classes: numpy.ndarray, dtype=SIZE_t + n_classes : numpy.ndarray, dtype=SIZE_t The number of unique classes in each target """ @@ -290,20 +290,20 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - y: array-like, dtype=DOUBLE_t + y : array-like, dtype=DOUBLE_t The target stored as a buffer for memory efficiency - y_stride: SIZE_t + y_stride : SIZE_t The stride between elements in the buffer, important if there are multiple targets (multi-output) - sample_weight: array-like, dtype=DTYPE_t + sample_weight : array-like, dtype=DTYPE_t The weight of each sample - weighted_n_samples: SIZE_t + weighted_n_samples : SIZE_t The total weight of all samples - samples: array-like, dtype=SIZE_t + samples : array-like, dtype=SIZE_t A mask on the samples, showing which ones we want to use - start: SIZE_t + start : SIZE_t The first sample to use in the mask - end: SIZE_t + end : SIZE_t The last sample to use in the mask """ @@ -399,7 +399,7 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - new_pos: SIZE_t + new_pos : SIZE_t The new ending position for which to move samples from the right child to the left child. 
""" @@ -484,7 +484,7 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - dest: double pointer + dest : double pointer The memory address which we will save the node value into. """ @@ -545,9 +545,9 @@ cdef class Entropy(ClassificationCriterion): Parameters ---------- - impurity_left: double pointer + impurity_left : double pointer The memory address to save the impurity of the left node - impurity_right: double pointer + impurity_right : double pointer The memory address to save the impurity of the right node """ @@ -632,9 +632,9 @@ cdef class Gini(ClassificationCriterion): Parameters ---------- - impurity_left: DTYPE_t + impurity_left : DTYPE_t The memory address to save the impurity of the left node to - impurity_right: DTYPE_t + impurity_right : DTYPE_t The memory address to save the impurity of the right node to """ @@ -692,10 +692,10 @@ cdef class RegressionCriterion(Criterion): Parameters ---------- - n_outputs: SIZE_t + n_outputs : SIZE_t The number of targets to be predicted - n_samples: SIZE_t + n_samples : SIZE_t The total number of samples to fit on """ @@ -988,10 +988,10 @@ cdef class MAE(RegressionCriterion): Parameters ---------- - n_outputs: SIZE_t + n_outputs : SIZE_t The number of targets to be predicted - n_samples: SIZE_t + n_samples : SIZE_t The total number of samples to fit on """ diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..5fa7ee553fe2d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -64,23 +64,23 @@ cdef class Splitter: """ Parameters ---------- - criterion: Criterion + criterion : Criterion The criterion to measure the quality of a split. - max_features: SIZE_t + max_features : SIZE_t The maximal number of randomly selected features which can be considered for a split. - min_samples_leaf: SIZE_t + min_samples_leaf : SIZE_t The minimal number of samples each leaf can have, where splits which would result in having less samples in a leaf are not considered. - min_weight_leaf: double + min_weight_leaf : double The minimal weight each leaf can have, where the weight is the sum of the weights of each sample in it. - random_state: object + random_state : object The user inputted random state to be used for pseudo-randomness """ @@ -127,13 +127,13 @@ cdef class Splitter: Parameters ---------- - X: object + X : object This contains the inputs. Usually it is a 2d numpy array. - y: numpy.ndarray, dtype=DOUBLE_t + y : numpy.ndarray, dtype=DOUBLE_t This is the vector of targets, or true labels, for the samples - sample_weight: numpy.ndarray, dtype=DOUBLE_t (optional) + sample_weight : numpy.ndarray, dtype=DOUBLE_t (optional) The weights of the samples, where higher weighted samples are fit closer than lower weight samples. If not provided, all samples are assumed to have uniform weight. @@ -187,11 +187,11 @@ cdef class Splitter: Parameters ---------- - start: SIZE_t + start : SIZE_t The index of the first sample to consider - end: SIZE_t + end : SIZE_t The index of the last sample to consider - weighted_n_node_samples: numpy.ndarray, dtype=double pointer + weighted_n_node_samples : numpy.ndarray, dtype=double pointer The total weight of those samples """ diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 21e9e0693d253..ac16ef9ad6263 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -43,7 +43,7 @@ def safe_mask(X, mask): X : {array-like, sparse matrix} Data on which to apply mask. - mask: array + mask : array Mask to be used on X. 
Returns diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index 366c4a14c5bc2..aa0caea2ce2b8 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -28,7 +28,7 @@ def __init__(self, extra=''): """ Parameters ---------- - extra: string + extra : string to be added to the deprecation messages """ diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 741601531d975..f0829470b540a 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -196,16 +196,16 @@ def randomized_range_finder(A, size, n_iter, Parameters ---------- - A: 2D array + A : 2D array The input data matrix - size: integer + size : integer Size of the return array - n_iter: integer + n_iter : integer Number of power iterations used to stabilize the result - power_iteration_normalizer: 'auto' (default), 'QR', 'LU', 'none' + power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' Whether the power iterations are normalized with step-by-step QR factorization (the slowest but most accurate), 'none' (the fastest but numerically unstable when `n_iter` is large, e.g. @@ -215,12 +215,12 @@ def randomized_range_finder(A, size, n_iter, .. versionadded:: 0.18 - random_state: RandomState or an int seed (0 by default) + random_state : RandomState or an int seed (0 by default) A random number generator instance Returns ------- - Q: 2D array + Q : 2D array A (size x size) projection matrix, the range of which approximates well the range of the input matrix A. @@ -274,20 +274,20 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', Parameters ---------- - M: ndarray or sparse matrix + M : ndarray or sparse matrix Matrix to decompose - n_components: int + n_components : int Number of singular values and vectors to extract. - n_oversamples: int (default is 10) + n_oversamples : int (default is 10) Additional number of random vectors to sample the range of M so as to ensure proper conditioning. The total number of random vectors used to find the range of M is n_components + n_oversamples. Smaller number can improve speed but can negatively impact the quality of approximation of singular vectors and singular values. - n_iter: int or 'auto' (default is 'auto') + n_iter : int or 'auto' (default is 'auto') Number of power iterations. It can be used to deal with very noisy problems. When 'auto', it is set to 4, unless `n_components` is small (< .1 * min(X.shape)) `n_iter` in which case is set to 7. @@ -295,7 +295,7 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', .. versionchanged:: 0.18 - power_iteration_normalizer: 'auto' (default), 'QR', 'LU', 'none' + power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' Whether the power iterations are normalized with step-by-step QR factorization (the slowest but most accurate), 'none' (the fastest but numerically unstable when `n_iter` is large, e.g. @@ -305,7 +305,7 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', .. versionadded:: 0.18 - transpose: True, False or 'auto' (default) + transpose : True, False or 'auto' (default) Whether the algorithm should be applied to M.T instead of M. The result should approximately be the same. The 'auto' mode will trigger the transposition if M.shape[1] > M.shape[0] since this @@ -314,13 +314,13 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', .. 
versionchanged:: 0.18 - flip_sign: boolean, (True by default) + flip_sign : boolean, (True by default) The output of a singular value decomposition is only unique up to a permutation of the signs of the singular vectors. If `flip_sign` is set to `True`, the sign ambiguity is resolved by making the largest loadings for each component in the left singular vectors positive. - random_state: RandomState or an int seed (0 by default) + random_state : RandomState or an int seed (0 by default) A random number generator instance to make behavior Notes @@ -655,15 +655,15 @@ def log_logistic(X, out=None): Parameters ---------- - X: array-like, shape (M, N) or (M, ) + X : array-like, shape (M, N) or (M, ) Argument to the logistic function - out: array-like, shape: (M, N) or (M, ), optional: + out : array-like, shape: (M, N) or (M, ), optional: Preallocated output array. Returns ------- - out: array, shape (M, N) or (M, ) + out : array, shape (M, N) or (M, ) Log of the logistic function evaluated at every point in x Notes @@ -700,15 +700,15 @@ def softmax(X, copy=True): Parameters ---------- - X: array-like, shape (M, N) + X : array-like, shape (M, N) Argument to the logistic function - copy: bool, optional + copy : bool, optional Copy X or not. Returns ------- - out: array, shape (M, N) + out : array, shape (M, N) Softmax function evaluated at every point in x """ if copy: diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index 595e0a7e15408..c87b6c2084f66 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -28,7 +28,7 @@ def single_source_shortest_path_length(graph, source, cutoff=None): Parameters ---------- - graph: sparse matrix or 2D array (preferably LIL matrix) + graph : sparse matrix or 2D array (preferably LIL matrix) Adjacency matrix of the graph source : node label Starting node for path diff --git a/sklearn/utils/graph_shortest_path.pyx b/sklearn/utils/graph_shortest_path.pyx index c21db158468e8..fcf5faeeb33cd 100644 --- a/sklearn/utils/graph_shortest_path.pyx +++ b/sklearn/utils/graph_shortest_path.pyx @@ -494,7 +494,7 @@ cdef void dijkstra_directed_one_row( graph : array, shape = (N,N) on return, graph[i_node] contains the path lengths from i_node to each target - heap: the Fibonacci heap object to use + heap : the Fibonacci heap object to use nodes : the array of nodes to use """ cdef unsigned int N = graph.shape[0] @@ -559,7 +559,7 @@ cdef void dijkstra_one_row(unsigned int i_node, graph : array, shape = (N,) on return, graph[i_node] contains the path lengths from i_node to each target - heap: the Fibonacci heap object to use + heap : the Fibonacci heap object to use nodes : the array of nodes to use """ cdef unsigned int N = graph.shape[0] diff --git a/sklearn/utils/murmurhash.pyx b/sklearn/utils/murmurhash.pyx index 486596917b2ec..0b61a5270cda7 100644 --- a/sklearn/utils/murmurhash.pyx +++ b/sklearn/utils/murmurhash.pyx @@ -87,13 +87,13 @@ def murmurhash3_32(key, seed=0, positive=False): Parameters ---------- - key: int32, bytes, unicode or ndarray with dtype int32 + key : int32, bytes, unicode or ndarray with dtype int32 the physical object to hash - seed: int, optional default is 0 + seed : int, optional default is 0 integer seed for the hashing algorithm. 
- positive: boolean, optional default is False + positive : boolean, optional default is False True: the results is casted to an unsigned int from 0 to 2 ** 32 - 1 False: the results is casted to a signed int diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index f9cbe8c45a81b..27550454da881 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -133,7 +133,7 @@ def newton_cg(grad_hess, func, grad, x0, args=(), tol=1e-4, x0 : array of float Initial guess. - args: tuple, optional + args : tuple, optional Arguments passed to func_grad_hess, func and grad. tol : float @@ -147,10 +147,10 @@ def newton_cg(grad_hess, func, grad, x0, args=(), tol=1e-4, maxinner : int Number of CG iterations. - line_search: boolean + line_search : boolean Whether to use a line search or not. - warn: boolean + warn : boolean Whether to warn when didn't converge. Returns diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 4e3357edbaac1..8515ff2593f31 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -67,19 +67,19 @@ def mean_variance_axis(X, axis): Parameters ---------- - X: CSR or CSC sparse matrix, shape (n_samples, n_features) + X : CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. - axis: int (either 0 or 1) + axis : int (either 0 or 1) Axis along which the axis should be computed. Returns ------- - means: float array with shape (n_features,) + means : float array with shape (n_features,) Feature-wise means - variances: float array with shape (n_features,) + variances : float array with shape (n_features,) Feature-wise variances """ @@ -110,31 +110,31 @@ def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n): Parameters ---------- - X: CSR or CSC sparse matrix, shape (n_samples, n_features) + X : CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. - axis: int (either 0 or 1) + axis : int (either 0 or 1) Axis along which the axis should be computed. - last_mean: float array with shape (n_features,) + last_mean : float array with shape (n_features,) Array of feature-wise means to update with the new data X. - last_var: float array with shape (n_features,) + last_var : float array with shape (n_features,) Array of feature-wise var to update with the new data X. - last_n: int + last_n : int Number of samples seen so far, excluded X. Returns ------- - means: float array with shape (n_features,) + means : float array with shape (n_features,) Updated feature-wise means. - variances: float array with shape (n_features,) + variances : float array with shape (n_features,) Updated feature-wise variances. - n: int + n : int Updated number of seen samples. """ @@ -166,10 +166,10 @@ def inplace_column_scale(X, scale): Parameters ---------- - X: CSC or CSR matrix with shape (n_samples, n_features) + X : CSC or CSR matrix with shape (n_samples, n_features) Matrix to normalize using the variance of the features. - scale: float array with shape (n_features,) + scale : float array with shape (n_features,) Array of precomputed feature-wise values to use for scaling. """ if isinstance(X, sp.csc_matrix): @@ -208,13 +208,13 @@ def inplace_swap_row_csc(X, m, n): Parameters ---------- - X: scipy.sparse.csc_matrix, shape=(n_samples, n_features) + X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) Matrix whose two rows are to be swapped. - m: int + m : int Index of the row of X to be swapped. - n: int + n : int Index of the row of X to be swapped. 
""" for t in [m, n]: @@ -237,13 +237,13 @@ def inplace_swap_row_csr(X, m, n): Parameters ---------- - X: scipy.sparse.csr_matrix, shape=(n_samples, n_features) + X : scipy.sparse.csr_matrix, shape=(n_samples, n_features) Matrix whose two rows are to be swapped. - m: int + m : int Index of the row of X to be swapped. - n: int + n : int Index of the row of X to be swapped. """ for t in [m, n]: @@ -295,10 +295,10 @@ def inplace_swap_row(X, m, n): X : CSR or CSC sparse matrix, shape=(n_samples, n_features) Matrix whose two rows are to be swapped. - m: int + m : int Index of the row of X to be swapped. - n: int + n : int Index of the row of X to be swapped. """ if isinstance(X, sp.csc_matrix): @@ -318,7 +318,7 @@ def inplace_swap_column(X, m, n): X : CSR or CSC sparse matrix, shape=(n_samples, n_features) Matrix whose two columns are to be swapped. - m: int + m : int Index of the column of X to be swapped. n : int @@ -344,16 +344,16 @@ def min_max_axis(X, axis): X : CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. - axis: int (either 0 or 1) + axis : int (either 0 or 1) Axis along which the axis should be computed. Returns ------- - mins: float array with shape (n_features,) + mins : float array with shape (n_features,) Feature-wise minima - maxs: float array with shape (n_features,) + maxs : float array with shape (n_features,) Feature-wise maxima """ if isinstance(X, sp.csr_matrix) or isinstance(X, sp.csc_matrix): diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 2388fbafac5ab..9ff79c628a1b8 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -56,16 +56,16 @@ def csr_mean_variance_axis0(X): Parameters ---------- - X: CSR sparse matrix, shape (n_samples, n_features) + X : CSR sparse matrix, shape (n_samples, n_features) Input data. Returns ------- - means: float array with shape (n_features,) + means : float array with shape (n_features,) Feature-wise means - variances: float array with shape (n_features,) + variances : float array with shape (n_features,) Feature-wise variances """ @@ -128,16 +128,16 @@ def csc_mean_variance_axis0(X): Parameters ---------- - X: CSC sparse matrix, shape (n_samples, n_features) + X : CSC sparse matrix, shape (n_samples, n_features) Input data. Returns ------- - means: float array with shape (n_features,) + means : float array with shape (n_features,) Feature-wise means - variances: float array with shape (n_features,) + variances : float array with shape (n_features,) Feature-wise variances """ @@ -203,25 +203,25 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n): Parameters ---------- - X: CSR or CSC sparse matrix, shape (n_samples, n_features) + X : CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. - last_mean: float array with shape (n_features,) + last_mean : float array with shape (n_features,) Array of feature-wise means to update with the new data X. - last_var: float array with shape (n_features,) + last_var : float array with shape (n_features,) Array of feature-wise var to update with the new data X. - last_n: int + last_n : int Number of samples seen so far, before X. 
Returns ------- - updated_mean: float array with shape (n_features,) + updated_mean : float array with shape (n_features,) Feature-wise means - updated_variance: float array with shape (n_features,) + updated_variance : float array with shape (n_features,) Feature-wise variances updated_n : int diff --git a/sklearn/utils/sparsetools/_traversal.pyx b/sklearn/utils/sparsetools/_traversal.pyx index 09a91bd230b85..5dd346307d497 100644 --- a/sklearn/utils/sparsetools/_traversal.pyx +++ b/sklearn/utils/sparsetools/_traversal.pyx @@ -60,9 +60,9 @@ def connected_components(csgraph, directed=True, connection='weak', Returns ------- - n_components: int + n_components : int The number of connected components. - labels: ndarray + labels : ndarray The length-N array of labels of the connected components. """ if connection.lower() not in ['weak', 'strong']: From d1e10af007bad8bfc976c0f9bcd54968a96d9b69 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 24 Nov 2016 12:55:30 +1100 Subject: [PATCH 2/2] DOC fix numpydoc format for param --- sklearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 162db281d8cf7..5d26d7f8e5ec9 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -140,7 +140,7 @@ def _pprint(params, offset=0, printer=repr): offset : int The offset in characters to add at the begin of each line. - printer: + printer : callable The function to convert entries to strings, typically the builtin str or repr
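As context for the `printer : callable` fix above: numpydoc only renders a parameter entry as a definition (name, type, description) when a space separates the parameter name from the colon that introduces its type, which is the spacing both patches enforce. The sketch below is a hypothetical helper, not part of scikit-learn, written purely to show a docstring that follows that `name : type` convention throughout.

import numpy as np


def scale_rows(X, factor=1.0):
    """Multiply every row of X by a constant factor.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data; any structure numpy can coerce to a 2D float array.

    factor : float, optional (default=1.0)
        Constant by which each row is multiplied.

    Returns
    -------
    X_new : ndarray, shape (n_samples, n_features)
        Scaled copy of the input; the original X is left untouched.
    """
    # asarray + dtype=float gives a fresh float array for lists of lists,
    # so the multiplication never modifies the caller's data in place.
    return np.asarray(X, dtype=float) * factor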