From aab030098b5115ca23f7c5871968479b9f46353f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 22 Jun 2016 22:26:34 +0530 Subject: [PATCH] Fix text data tutorial - Fix a typo. - Fix a floating point error in doctests. - Fix `VisibleDeprecationWarning` due to conversion of an array with ndim > 0 to an index. - Convert `print` to `log` to avoid doctests failure. Fixes: #6927 Signed-off-by: Rohan Jain --- doc/tutorial/text_analytics/working_with_text_data.rst | 6 +++--- sklearn/datasets/twenty_newsgroups.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/tutorial/text_analytics/working_with_text_data.rst b/doc/tutorial/text_analytics/working_with_text_data.rst index e6d33f3249bce..d605cd1b34d51 100644 --- a/doc/tutorial/text_analytics/working_with_text_data.rst +++ b/doc/tutorial/text_analytics/working_with_text_data.rst @@ -184,7 +184,7 @@ The most intuitive way to do so is the bags of words representation: The bags of words representation implies that ``n_features`` is the number of distinct words in the corpus: this number is typically -larger that 100,000. +larger than 100,000. If ``n_samples == 10000``, storing ``X`` as a numpy array of type float32 would require 10000 x 100000 x 4 bytes = **4GB in RAM** which @@ -443,13 +443,13 @@ to speed up the computation:: The result of calling ``fit`` on a ``GridSearchCV`` object is a classifier that we can use to ``predict``:: - >>> twenty_train.target_names[gs_clf.predict(['God is love'])] + >>> twenty_train.target_names[gs_clf.predict(['God is love'])[0]] 'soc.religion.christian' The object's ``best_score_`` and ``best_params_`` attributes store the best mean score and the parameters setting corresponding to that score:: - >>> gs_clf.best_score_ # doctest: +ELLIPSIS 0.900... >>> for param_name in sorted(parameters.keys()): ... 
print("%s: %r" % (param_name, gs_clf.best_params_[param_name])) diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index a62c1241276d1..524d38d6c1ec1 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -219,7 +219,8 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, if cache is None: if download_if_missing: - print('Downloading 20news dataset. This may take a few minutes.') + logger.info("Downloading 20news dataset. " + "This may take a few minutes.") cache = download_20newsgroups(target_dir=twenty_home, cache_path=cache_path) else: