scikit-learn · flaviomartins · Sep 28, 2018 · Oct 1, 2018 · Oct 2, 2018
diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py
@@ -92,6 +92,12 @@
 op.add_option("--n-features", type=int, default=10000,
               help="Maximum number of features (dimensions)"
                    " to extract from text.")
+# Metadata is stripped by default; --no-remove flips remove_extra to False.
+op.add_option("--no-remove", action="store_false", dest="remove_extra",
+              default=True, help="Keep 'headers', 'footers', 'quotes'")
+# The value is handed unchanged to the estimator's `metric` argument.
+op.add_option("--metric", dest="metric", type="string", default="euclidean",
+              help="Specify the distance metric to use for KMeans.")
 op.add_option("--verbose",
               action="store_true", dest="verbose", default=False,
               help="Print progress reports inside k-means algorithm.")
@@ -126,8 +132,12 @@ def is_interactive():
 print("Loading 20 newsgroups dataset for categories:")
 print(categories)
 
+# Post metadata is stripped by default; --no-remove (remove_extra False)
+# leaves `remove` empty, so nothing is filtered out of the messages.
+remove = ('headers', 'footers', 'quotes') if opts.remove_extra else ()
+
 dataset = fetch_20newsgroups(subset='all', categories=categories,
-                             shuffle=True, random_state=42)
+                             remove=remove, shuffle=True, random_state=42)
 
 print("%d documents" % len(dataset.data))
 print("%d categories" % len(dataset.target_names))
@@ -186,10 +196,11 @@ def is_interactive():
 
 if opts.minibatch:
     km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
-                         init_size=1000, batch_size=1000, verbose=opts.verbose)
+                         init_size=1000, batch_size=1000, verbose=opts.verbose,
+                         metric=opts.metric)
 else:
     km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
-                verbose=opts.verbose)
+                verbose=opts.verbose, metric=opts.metric)
 
 print("Clustering sparse data with %s" % km)
 t0 = time()