From aab030098b5115ca23f7c5871968479b9f46353f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 22 Jun 2016 22:26:34 +0530 Subject: [PATCH] Fix text data tutorial - Fix a typo. - Fix a floating point error in doctests. - Fix `VisibleDeprecationWarning` due to conversion of an array with ndim > 0 to an index. - Convert `print` to `log` to avoid doctests failure. Fixes: #6927 Signed-off-by: Rohan Jain --- doc/tutorial/text_analytics/working_with_text_data.rst | 6 +++--- sklearn/datasets/twenty_newsgroups.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/tutorial/text_analytics/working_with_text_data.rst b/doc/tutorial/text_analytics/working_with_text_data.rst index e6d33f3249bce..d605cd1b34d51 100644 --- a/doc/tutorial/text_analytics/working_with_text_data.rst +++ b/doc/tutorial/text_analytics/working_with_text_data.rst @@ -184,7 +184,7 @@ The most intuitive way to do so is the bags of words representation: The bags of words representation implies that ``n_features`` is the number of distinct words in the corpus: this number is typically -larger that 100,000. +larger than 100,000. If ``n_samples == 10000``, storing ``X`` as a numpy array of type float32 would require 10000 x 100000 x 4 bytes = **4GB in RAM** which @@ -443,13 +443,13 @@ to speed up the computation:: The result of calling ``fit`` on a ``GridSearchCV`` object is a classifier that we can use to ``predict``:: - >>> twenty_train.target_names[gs_clf.predict(['God is love'])] + >>> twenty_train.target_names[gs_clf.predict(['God is love'])[0]] 'soc.religion.christian' The object's ``best_score_`` and ``best_params_`` attributes store the best mean score and the parameters setting corresponding to that score:: - >>> gs_clf.best_score_ # doctest: +ELLIPSIS 0.900... >>> for param_name in sorted(parameters.keys()): ... 
print("%s: %r" % (param_name, gs_clf.best_params_[param_name])) diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index a62c1241276d1..524d38d6c1ec1 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -219,7 +219,8 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, if cache is None: if download_if_missing: - print('Downloading 20news dataset. This may take a few minutes.') + logger.info("Downloading 20news dataset. " + "This may take a few minutes.") cache = download_20newsgroups(target_dir=twenty_home, cache_path=cache_path) else: