From d83472db8ba7e0da6ecb5baae6277437c122ba25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 13 Dec 2017 10:21:06 +0100 Subject: [PATCH 1/2] DOC run three more examples using sys.argv. sphinx-gallery 0.1.12 has the fix for sys.argv bug. --- ...newsgroups.py => plot_document_classification_20newsgroups.py} | 0 .../text/{document_clustering.py => plot_document_clustering.py} | 0 ...g_vs_dict_vectorizer.py => plot_hashing_vs_dict_vectorizer.py} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename examples/text/{document_classification_20newsgroups.py => plot_document_classification_20newsgroups.py} (100%) rename examples/text/{document_clustering.py => plot_document_clustering.py} (100%) rename examples/text/{hashing_vs_dict_vectorizer.py => plot_hashing_vs_dict_vectorizer.py} (100%) diff --git a/examples/text/document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py similarity index 100% rename from examples/text/document_classification_20newsgroups.py rename to examples/text/plot_document_classification_20newsgroups.py diff --git a/examples/text/document_clustering.py b/examples/text/plot_document_clustering.py similarity index 100% rename from examples/text/document_clustering.py rename to examples/text/plot_document_clustering.py diff --git a/examples/text/hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py similarity index 100% rename from examples/text/hashing_vs_dict_vectorizer.py rename to examples/text/plot_hashing_vs_dict_vectorizer.py From 45d5f617e368c189a085f7218f441b7609fd1b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 13 Dec 2017 10:53:13 +0100 Subject: [PATCH 2/2] Fix flake8 in examples --- .../text/plot_document_classification_20newsgroups.py | 2 ++ examples/text/plot_document_clustering.py | 8 +++++--- examples/text/plot_hashing_vs_dict_vectorizer.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 847e17f25bef4..8b9d66f9e09d5 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -88,6 +88,7 @@ def is_interactive(): return not hasattr(sys.modules['__main__'], '__file__') + # work-around for Jupyter notebook and IPython console argv = [] if is_interactive() else sys.argv[1:] (opts, args) = op.parse_args(argv) @@ -136,6 +137,7 @@ def is_interactive(): def size_mb(docs): return sum(len(s.encode('utf-8')) for s in docs) / 1e6 + data_train_size_mb = size_mb(data_train.data) data_test_size_mb = size_mb(data_test.data) diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py index a2e9a93b23085..4a2b28f011e9e 100644 --- a/examples/text/plot_document_clustering.py +++ b/examples/text/plot_document_clustering.py @@ -27,8 +27,8 @@ Two algorithms are demoed: ordinary k-means and its more scalable cousin minibatch k-means. -Additionally, latent semantic analysis can also be used to reduce dimensionality -and discover latent patterns in the data. +Additionally, latent semantic analysis can also be used to reduce +dimensionality and discover latent patterns in the data. It can be noted that k-means (and minibatch k-means) are very sensitive to feature scaling and that in this case the IDF weighting helps improve the @@ -106,6 +106,7 @@ def is_interactive(): return not hasattr(sys.modules['__main__'], '__file__') + # work-around for Jupyter notebook and IPython console argv = [] if is_interactive() else sys.argv[1:] (opts, args) = op.parse_args(argv) @@ -138,7 +139,8 @@ def is_interactive(): labels = dataset.target true_k = np.unique(labels).shape[0] -print("Extracting features from the training dataset using a sparse vectorizer") +print("Extracting features from the training dataset " + "using a sparse vectorizer") t0 = time() if opts.use_hashing: if opts.use_idf: diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py index c437d79246ceb..f2b673012cbc2 100644 --- a/examples/text/plot_hashing_vs_dict_vectorizer.py +++ b/examples/text/plot_hashing_vs_dict_vectorizer.py @@ -62,7 +62,7 @@ def token_freqs(doc): 'talk.religion.misc', ] # Uncomment the following line to use a larger set (11k+ documents) -#categories = None +# categories = None print(__doc__) print("Usage: %s [n_features_for_hashing]" % sys.argv[0])