[WIP] Implemented SelectFromModel meta-transformer #3011


Closed
wants to merge 17 commits into from
44 changes: 14 additions & 30 deletions .travis.yml
@@ -1,34 +1,18 @@
language: python
env:
- COVERAGE=--with-coverage
python:
- "2.7"
- "2.6"
- "3.3"
virtualenv:
system_site_packages: true
before_install:
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then wget http://repo.continuum.io/miniconda/Miniconda-2.2.2-Linux-x86_64.sh -O miniconda.sh ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then chmod +x miniconda.sh ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then ./miniconda.sh -b ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then export PATH=/home/travis/anaconda/bin:$PATH ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda update --yes conda ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda update --yes conda ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda create -n testenv --yes pip python=$TRAVIS_PYTHON_VERSION ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then source activate testenv ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda install --yes numpy scipy nose ; fi
- if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then sudo apt-get update -qq ; fi
- if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then sudo apt-get install -qq python-scipy python-nose python-pip ; fi
install:
- python setup.py build_ext --inplace
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coverage; fi
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coveralls; fi
script:
- if [ "${COVERAGE}" == "--with-coverage" ]; then
- make test-coverage;
- else
- make test;
- fi
env:
matrix:
- DISTRIB="ubuntu" PYTHON_VERSION="2.7" INSTALL_ATLAS="true"
COVERAGE="true"
# This environment tests the oldest supported anaconda env
- DISTRIB="conda" PYTHON_VERSION="2.6" INSTALL_MKL="false"
NUMPY_VERSION="1.6.2" SCIPY_VERSION="0.11.0"
# This environment tests the newest supported anaconda env
- DISTRIB="conda" PYTHON_VERSION="3.4" INSTALL_MKL="true"
NUMPY_VERSION="1.8.1" SCIPY_VERSION="0.13.3"
install: source continuous_integration/install.sh
script: bash continuous_integration/test_script.sh
after_success:
- if [ "${COVERAGE}" == "--with-coverage" ]; then coveralls; fi

- if [[ "$COVERAGE" == "true" ]]; then coveralls; fi
cache: apt
6 changes: 4 additions & 2 deletions README.rst
@@ -34,8 +34,10 @@ Important links
Dependencies
============

scikit-learn is tested to work under Python 2.6+ and Python 3.3+
(using the same codebase thanks to an embedded copy of `six <http://pythonhosted.org/six/>`_).
scikit-learn is tested to work under Python 2.6, Python 2.7, and Python 3.4
(using the same codebase thanks to an embedded copy of
`six <http://pythonhosted.org/six/>`_). It should also work against Python 3.3.

The required dependencies to build the software are NumPy >= 1.6.1, SciPy >= 0.9
and a working C/C++ compiler.
48 changes: 48 additions & 0 deletions continuous_integration/install.sh
@@ -0,0 +1,48 @@
#!/bin/bash
# This script is meant to be called by the "install" step defined in
# .travis.yml. See http://docs.travis-ci.com/ for more details.
# The behavior of the script is controlled by environment variables defined
# in the .travis.yml in the top level folder of the project.

set -e

sudo apt-get update -qq
if [[ "$INSTALL_ATLAS" == "true" ]]; then
sudo apt-get install -qq libatlas3gf-base libatlas-dev
fi

if [[ "$DISTRIB" == "conda" ]]; then
# Deactivate the travis-provided virtual environment and setup a
# conda-based environment instead
deactivate

# Use the miniconda installer for faster download / install of conda
# itself
wget http://repo.continuum.io/miniconda/Miniconda-2.2.2-Linux-x86_64.sh \
-O miniconda.sh
chmod +x miniconda.sh && ./miniconda.sh -b
export PATH=/home/travis/anaconda/bin:$PATH
conda update --yes conda

# Configure the conda environment and put it in the path using the
# provided versions
conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION
source activate testenv

if [[ "$INSTALL_MKL" == "true" ]]; then
# Make sure that MKL is used
conda install --yes mkl
else
# Make sure that MKL is not used
conda remove --yes --features mkl || echo "MKL not installed"
fi

elif [[ "$DISTRIB" == "ubuntu" ]]; then
# Use standard ubuntu packages in their default version
sudo apt-get install -qq python-scipy python-nose python-pip
fi

if [[ "$COVERAGE" == "true" ]]; then
pip install coverage coveralls
fi
19 changes: 19 additions & 0 deletions continuous_integration/test_script.sh
@@ -0,0 +1,19 @@
#!/bin/bash
# This script is meant to be called by the "script" step defined in
# .travis.yml. See http://docs.travis-ci.com/ for more details.
# The behavior of the script is controlled by environment variables defined
# in the .travis.yml in the top level folder of the project.

set -e

python --version
python -c "import numpy; print('numpy %s' % numpy.__version__)"
python -c "import scipy; print('scipy %s' % scipy.__version__)"
python setup.py build_ext --inplace

if [[ "$COVERAGE" == "true" ]]; then
export WITH_COVERAGE="--with-coverage"
else
export WITH_COVERAGE=""
fi
nosetests -s -v $WITH_COVERAGE sklearn
19 changes: 1 addition & 18 deletions doc/Makefile
@@ -12,7 +12,7 @@ PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex latexpdf changes linkcheck doctest
.PHONY: help clean html dirhtml pickle json latex latexpdf changes linkcheck doctest

all: html-noplot

@@ -22,8 +22,6 @@ help:
@echo " dirhtml to make HTML files named index.html in directories"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " changes to make an overview of all changed/added/deprecated items"
@@ -65,21 +63,6 @@ json:
@echo
@echo "Build finished; now you can process the JSON files."

htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scikit-learn.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scikit-learn.qhc"

latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
13 changes: 8 additions & 5 deletions doc/developers/index.rst
@@ -647,9 +647,11 @@ To summarize, a `__init__` should look like::
self.param1 = param1
self.param2 = param2

There should be no logic, and the parameters should not be changed.
The corresponding logic should be put where the parameters are used. The
following is wrong::
There should be no logic, not even input validation,
and the parameters should not be changed.
The corresponding logic should be put where the parameters are used,
typically in ``fit``.
The following is wrong::

def __init__(self, param1=1, param2=2, param3=3):
# WRONG: parameters should not be modified
@@ -660,8 +662,9 @@ following is wrong::
# the argument in the constructor
self.param3 = param2

Scikit-learn relies on this mechanism to introspect objects to set
their parameters by cross-validation.
The reason for postponing the validation is that the same validation
would have to be performed in ``set_params``,
which is used in algorithms like ``GridSearchCV``.
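
A minimal sketch of this convention (the estimator and parameter names here
are hypothetical, not part of scikit-learn)::

    from sklearn.base import BaseEstimator

    class MyEstimator(BaseEstimator):

        def __init__(self, param1=1, param2=2):
            # no logic, no validation: store the arguments verbatim
            self.param1 = param1
            self.param2 = param2

        def fit(self, X, y=None):
            # validation is deferred to where the parameter is used
            if self.param1 <= 0:
                raise ValueError("param1 must be positive")
            return self

With this layout, ``set_params`` (and therefore ``GridSearchCV``) can replace
``param1`` or ``param2`` without duplicating any validation logic.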

Fitting
^^^^^^^
2 changes: 1 addition & 1 deletion doc/index.rst
@@ -161,7 +161,7 @@
<strong>Applications</strong>: Visualization, Increased efficiency</br>
<strong>Algorithms</strong>:&nbsp;

:ref:`PCA<PCA>`, :ref:`Isomap<isomap>`, :ref:`non-negative matrix factorization<NMF>`.
:ref:`PCA<PCA>`, :ref:`feature selection<feature_selection>`, :ref:`non-negative matrix factorization<NMF>`.

.. raw:: html

72 changes: 35 additions & 37 deletions doc/modules/clustering.rst
@@ -118,15 +118,37 @@ K-means

The :class:`KMeans` algorithm clusters data by trying to separate samples
in n groups of equal variance, minimizing a criterion known as the
'inertia' of the groups. This algorithm requires the number of clusters to
be specified. It scales well to large number of samples and has been used
across a large range of application areas in many different fields. It is
also equivalent to the expectation-maximization algorithm when setting the
covariance matrix to be diagonal, equal and small. The K-means algorithm
aims to choose centroids :math:`C` that minimise the within cluster sum of
squares objective function with a dataset :math:`X` with :math:`n` samples:

.. math:: J(X, C) = \sum_{i=0}^{n}\min_{\mu_j \in C}(||x_j - \mu_i||^2)
:ref:`inertia <inertia>` or within-cluster sum-of-squares.
This algorithm requires the number of clusters to be specified.
It scales well to large numbers of samples and has been used
across a large range of application areas in many different fields.

The k-means algorithm divides a set of :math:`N` samples :math:`X`
into :math:`K` disjoint clusters :math:`C`,
each described by the mean :math:`\mu_j` of the samples in the cluster.
The means are commonly called the cluster "centroids";
note that they are not, in general, points from :math:`X`,
although they live in the same space.
The K-means algorithm aims to choose centroids
that minimise the *inertia*, or within-cluster sum-of-squares criterion:

.. math:: \sum_{i=0}^{n}\min_{\mu_j \in C}(||x_i - \mu_j||^2)

Inertia, or the within-cluster sum of squares criterion,
can be recognized as a measure of how internally coherent clusters are.
It suffers from various drawbacks:

- Inertia makes the assumption that clusters are convex and isotropic,
which is not always the case. It responds poorly to elongated clusters,
or manifolds with irregular shapes.

- Inertia is not a normalized metric: we just know that lower values are
better and zero is optimal. But in very high-dimensional spaces, Euclidean
distances tend to become inflated
(this is an instance of the so-called "curse of dimensionality").
Running a dimensionality reduction algorithm such as :ref:`PCA <PCA>`
prior to k-means clustering can alleviate this problem
and speed up the computations.
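
As a brief illustrative sketch (assuming the :class:`KMeans` estimator with
its ``cluster_centers_`` and ``inertia_`` attributes), the criterion above
can be inspected after fitting::

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.array([[1, 2], [1, 4], [1, 0],
                  [10, 2], [10, 4], [10, 0]])
    km = KMeans(n_clusters=2).fit(X)
    print(km.cluster_centers_)  # the centroids mu_j
    print(km.inertia_)          # within-cluster sum of squares minimised above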

K-means is often referred to as Lloyd's algorithm. In basic terms, the
algorithm has three steps. The first step chooses the initial centroids, with
@@ -144,7 +166,10 @@ until the centroids do not move significantly.
:align: right
:scale: 35

The algorithm can be understood through the concept of `Voronoi diagrams
K-means is equivalent to the expectation-maximization algorithm
with a small, all-equal, diagonal covariance matrix.

The algorithm can also be understood through the concept of `Voronoi diagrams
<https://en.wikipedia.org/wiki/Voronoi_diagram>`_. First the Voronoi diagram of
the points is calculated using the current centroids. Each segment in the
Voronoi diagram becomes a separate cluster. Secondly, the centroids are updated
@@ -753,33 +778,6 @@ classes according to some similarity metric.

.. currentmodule:: sklearn.metrics

Inertia
-------

Presentation and usage
~~~~~~~~~~~~~~~~~~~~~~

TODO: factorize inertia computation out of kmeans and then write me!


Advantages
~~~~~~~~~~

- No need for the ground truth knowledge of the "real" classes.

Drawbacks
~~~~~~~~~

- Inertia makes the assumption that clusters are convex and isotropic
which is not always the case especially of the clusters are manifolds
with weird shapes: for instance inertia is a useless metrics to evaluate
clustering algorithm that tries to identify nested circles on a 2D plane.

- Inertia is not a normalized metrics: we just know that lower values are
better and bounded by zero. One potential solution would be to adjust
inertia for random clustering (assuming the number of ground truth classes
is known).


Adjusted Rand index
-------------------
8 changes: 4 additions & 4 deletions doc/modules/linear_model.rst
@@ -268,11 +268,11 @@ They also tend to break when the problem is badly conditioned

Elastic Net
===========
:class:`ElasticNet` is a linear model trained with L1 and L2 prior as
regularizer. This combination allows for learning a sparse model where
:class:`ElasticNet` is a linear regression model trained with L1 and L2 prior
as regularizer. This combination allows for learning a sparse model where
few of the weights are non-zero like :class:`Lasso`, while still maintaining
the regularization properties of :class:`Ridge`. We control this tradeoff
using the `l1_ratio` parameter.
the regularization properties of :class:`Ridge`. We control the convex
combination of L1 and L2 using the `l1_ratio` parameter.
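
A short usage sketch (assuming the :class:`ElasticNet` estimator with its
``alpha`` and ``l1_ratio`` parameters; the data is purely illustrative)::

    import numpy as np
    from sklearn.linear_model import ElasticNet

    X = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])
    y = np.array([0.0, 1.0, 2.0])
    # l1_ratio=0.5 weights the L1 and L2 penalties equally;
    # l1_ratio=1.0 would correspond to the Lasso penalty alone.
    enet = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)
    print(enet.coef_)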

Elastic-net is useful when there are multiple features which are
correlated with one another. Lasso is likely to pick one of these
2 changes: 1 addition & 1 deletion doc/tutorial/basic/tutorial.rst
@@ -86,7 +86,7 @@ datasets for classification and the `boston house prices dataset
A dataset is a dictionary-like object that holds all the data and some
metadata about the data. This data is stored in the ``.data`` member,
which is a ``n_samples, n_features`` array. In the case of supervised
problem, explanatory variables are stored in the ``.target`` member. More
problem, one or more response variables are stored in the ``.target`` member. More
details on the different datasets can be found in the :ref:`dedicated
section <datasets>`.
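
For instance, a minimal sketch with the bundled digits dataset (assuming
``datasets.load_digits``)::

    from sklearn import datasets

    digits = datasets.load_digits()
    print(digits.data.shape)   # (n_samples, n_features)
    print(digits.target[:10])  # response variable for the first ten samples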

13 changes: 0 additions & 13 deletions doc/tutorial/statistical_inference/index.rst
@@ -26,18 +26,6 @@ A tutorial on statistical-learning for scientific data processing

.. include:: ../../includes/big_toc_css.rst

.. warning::

In scikit-learn release 0.9, the import path has changed from
`scikits.learn` to `sklearn`. To import with cross-version
compatibility, use::

try:
from sklearn import something
except ImportError:
from scikits.learn import something


.. toctree::
:maxdepth: 2

@@ -47,4 +35,3 @@
unsupervised_learning
putting_together
finding_help

4 changes: 2 additions & 2 deletions doc/tutorial/text_analytics/working_with_text_data.rst
@@ -25,7 +25,7 @@ Tutorial setup
--------------

To get started with this tutorial, you firstly must have the
*scikit-learn* and all of its requiered dependencies installed.
*scikit-learn* and all of its required dependencies installed.

Please refer to the `scikit-learn install`_ page for more information
and for per-system instructions.
@@ -419,7 +419,7 @@ Instead of tweaking the parameters of the various components of the
chain, it is possible to run an exhaustive search of the best
parameters on a grid of possible values. We try out all classifiers
on either words or bigrams, with or without idf, and with a penalty
parameter of either 100 or 1000 for the linear SVM::
parameter of either 0.01 or 0.001 for the linear SVM::

>>> from sklearn.grid_search import GridSearchCV
>>> parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
5 changes: 5 additions & 0 deletions doc/whats_new.rst
@@ -182,6 +182,11 @@ Changelog
:class:`cluster.WardAgglomeration` when no samples are given,
rather than returning meaningless clustering.

- Grid search and cross validation allow NaNs in the input arrays so that
preprocessors such as :class:`preprocessing.Imputer
<preprocessing.Imputer>` can be trained within the cross validation loop,
avoiding potentially skewed results.
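
A minimal sketch of the pattern this enables (assuming the
:class:`preprocessing.Imputer`, :class:`pipeline.Pipeline` and
:class:`grid_search.GridSearchCV` APIs)::

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Imputer
    from sklearn.linear_model import Ridge
    from sklearn.grid_search import GridSearchCV

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0], [4.0, np.nan]])
    y = np.array([1.0, 2.0, 3.0, 4.0])

    pipe = Pipeline([('impute', Imputer(strategy='mean')),
                     ('ridge', Ridge())])
    # NaNs pass through the CV split; the Imputer is fit on each training fold
    grid = GridSearchCV(pipe, {'ridge__alpha': [0.1, 1.0]}, cv=2)
    grid.fit(X, y)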


API changes summary
-------------------
2 changes: 1 addition & 1 deletion examples/applications/face_recognition.py
@@ -54,7 +54,7 @@
# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape

# fot machine learning we use the 2 data directly (as relative pixel
# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]