[MRG] Support multi-threading of LibLinear L1 one-vs-rest LogisticRegression for # classes > 2 #6448

Closed · wants to merge 26 commits

26 commits
ddcb64d
Support new scipy sparse array indices, which can now be > 2^31 (< 2^…
Jan 19, 2016
c75c0b8
Also increase size of integer values in indptr in the next step.
Jan 20, 2016
3ec2503
Use long for both arrays if scipy >= 0.14.
Jan 28, 2016
2bd203b
Support hard-wired number of threads while processing one-versus-rest…
Feb 23, 2016
35806f2
Reduce memory consumption for L1 solvers, especially for parallel mul…
Feb 24, 2016
dc8e3b8
Add previously ignored source file
Feb 24, 2016
459816b
Add parameter n_threads to LogisticRegression to control multi-threading
Feb 25, 2016
93c2967
Isolate threading code for easy compilation without it
Feb 25, 2016
345cbfd
Define thread return properly for Windows
Mar 1, 2016
484334c
Fix n_threads=1 case
Jun 11, 2016
72914f4
Make n_threads optional, to fix regression tests
Aug 30, 2016
68a964b
Fix doctest to accommodate new parameter
Aug 30, 2016
e7b57d8
Fix doctest to accommodate new parameter
Aug 30, 2016
5467e94
Fix Windows build issue
Aug 30, 2016
e0e3cb7
Fix Windows build issue
Aug 30, 2016
1427198
Fix flake8 complaints related to branch changes
Aug 30, 2016
907ba05
Fix flake8 complaints in general, to pass regression tests
Aug 31, 2016
02c8637
Revert "Fix flake8 complaints in general, to pass regression tests"
Aug 31, 2016
fdbb65c
Smaller flake8 fixes
Aug 31, 2016
49980f7
Fix only some of the flake8 complaints
Aug 31, 2016
f98498c
Fix only some of the flake8 complaints
Aug 31, 2016
db062d4
Revert "Fix only some of the flake8 complaints"
Aug 31, 2016
7eef8d4
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
Aug 31, 2016
6433761
Fix remaining flake8 issues
Aug 31, 2016
0234ae7
Address doctest failure?
Aug 31, 2016
c29224d
Address doctest failure? Arguments in alphabetic order
Aug 31, 2016
2 changes: 1 addition & 1 deletion doc/tutorial/machine_learning_map/parse_path.py
@@ -83,7 +83,7 @@ def convertToFloat(s, loc, toks):

 coordinate = number

-#comma or whitespace can separate values all over the place in SVG
+# comma or whitespace can separate values all over the place in SVG
 maybeComma = Optional(Literal(',')).suppress()

 coordinateSequence = Sequence(coordinate)
5 changes: 3 additions & 2 deletions doc/tutorial/statistical_inference/supervised_learning.rst
@@ -372,8 +372,9 @@ function or **logistic** function:
     >>> logistic.fit(iris_X_train, iris_y_train)
     LogisticRegression(C=100000.0, class_weight=None, dual=False,
               fit_intercept=True, intercept_scaling=1, max_iter=100,
-              multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
-              solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
+              multi_class='ovr', n_jobs=1, n_threads=1, penalty='l2',
+              random_state=None, solver='liblinear', tol=0.0001, verbose=0,
+              warm_start=False)

 This is known as :class:`LogisticRegression`.

4 changes: 2 additions & 2 deletions examples/text/document_clustering.py
@@ -27,8 +27,8 @@
 Two algorithms are demoed: ordinary k-means and its more scalable cousin
 minibatch k-means.

-Additionally, latent semantic analysis can also be used to reduce dimensionality
-and discover latent patterns in the data.
+Additionally, latent semantic analysis can also be used to reduce
+dimensionality and discover latent patterns in the data.

 It can be noted that k-means (and minibatch k-means) are very sensitive to
 feature scaling and that in this case the IDF weighting helps improve the
40 changes: 35 additions & 5 deletions sklearn/feature_extraction/text.py
@@ -30,7 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils import deprecated
-from ..utils.fixes import frombuffer_empty, bincount
+from ..utils.fixes import frombuffer_empty, bincount, sp_version
 from ..utils.validation import check_is_fitted

 __all__ = ['CountVectorizer',
@@ -741,8 +741,19 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__

         analyze = self.build_analyzer()
-        j_indices = _make_int_array()
-        indptr = _make_int_array()
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # NOTE: long on Windows is only 32 bits
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_long_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_long_array()
+        else:
+            # Sparse arrays only support 32-bit integers
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_int_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             for feature in analyze(doc):
@@ -760,8 +771,16 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")

-        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # int_ == "l" (long)
+            # NOTE: long on Windows is only 32 bits
+            j_indices = frombuffer_empty(j_indices, dtype=np.int_)
+            indptr = np.frombuffer(indptr, dtype=np.int_)
+        else:
+            # Sparse arrays only support 32-bit integers
+            j_indices = frombuffer_empty(j_indices, dtype=np.intc)
+            indptr = np.frombuffer(indptr, dtype=np.intc)
         values = np.ones(len(j_indices))

         X = sp.csr_matrix((values, j_indices, indptr),
@@ -907,6 +926,17 @@ def _make_int_array():
     return array.array(str("i"))


+def _make_long_array():
+    """Construct an array.array of a type suitable for large scipy.sparse indices.
+
+    scipy 0.14 and later can construct sparse matrices with 64 bit integer
+    indices.
+
+    NOTE: long on Windows is only 32 bits
+    """
+    return array.array(str("l"))
+
+
 class TfidfTransformer(BaseEstimator, TransformerMixin):
     """Transform a count matrix to a normalized tf or tf-idf representation

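For context, the switch above is between two array.array typecodes: "i" (C int, 32 bits on all common platforms) and "l" (C long, 64 bits on most Unix platforms but only 32 bits on Windows, as the NOTE warns). A minimal standalone sketch of the overflow the sp_version gate works around; the 2**31 threshold and the OverflowError come from the typecode sizes, not from scikit-learn itself:

import array

int_indices = array.array("i")   # always 4 bytes per item
long_indices = array.array("l")  # 8 bytes on 64-bit Linux/macOS, 4 on Windows
print(int_indices.itemsize, long_indices.itemsize)

# An indptr value of 2**31 no longer fits in a signed 32-bit integer,
# which is exactly what very large document collections can hit.
try:
    int_indices.append(2 ** 31)
except OverflowError:
    print("32-bit index array overflows at 2**31")
if long_indices.itemsize == 8:
    long_indices.append(2 ** 31)  # fine where long is 64 bits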
44 changes: 34 additions & 10 deletions sklearn/linear_model/logistic.py
@@ -12,6 +12,7 @@

 import numbers
 import warnings
+import multiprocessing

 import numpy as np
 from scipy import optimize, sparse
@@ -451,7 +452,8 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
                              class_weight=None, dual=False, penalty='l2',
                              intercept_scaling=1., multi_class='ovr',
                              random_state=None, check_input=True,
-                             max_squared_sum=None, sample_weight=None):
+                             max_squared_sum=None, sample_weight=None,
+                             n_threads=1):
     """Compute a Logistic Regression model for a list of regularization
     parameters.

@@ -564,6 +566,9 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         Array of weights that are assigned to individual samples.
         If not provided, then each sample is given unit weight.

+    n_threads : int
+        Number of threads to use.
+
     Returns
     -------
     coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
@@ -730,7 +735,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
             coef_, intercept_, n_iter_i, = _fit_liblinear(
                 X, target, C, fit_intercept, intercept_scaling, None,
                 penalty, dual, verbose, max_iter, tol, random_state,
-                sample_weight=sample_weight)
+                sample_weight=sample_weight, n_threads=n_threads)
             if fit_intercept:
                 w0 = np.concatenate([coef_.ravel(), intercept_])
             else:
@@ -771,7 +776,8 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
                           verbose=0, solver='lbfgs', penalty='l2',
                           dual=False, intercept_scaling=1.,
                           multi_class='ovr', random_state=None,
-                          max_squared_sum=None, sample_weight=None):
+                          max_squared_sum=None, sample_weight=None,
+                          n_threads=1):
     """Computes scores across logistic_regression_path

     Parameters
@@ -874,6 +880,9 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
         Array of weights that are assigned to individual samples.
         If not provided, then each sample is given unit weight.

+    n_threads : int
+        Number of threads to use.
+
     Returns
     -------
     coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
@@ -907,7 +916,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
         tol=tol, verbose=verbose, dual=dual, penalty=penalty,
         intercept_scaling=intercept_scaling, random_state=random_state,
         check_input=False, max_squared_sum=max_squared_sum,
-        sample_weight=sample_weight)
+        sample_weight=sample_weight, n_threads=n_threads)

     log_reg = LogisticRegression(fit_intercept=fit_intercept)

@@ -1073,6 +1082,10 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         Number of CPU cores used during the cross-validation loop. If given
         a value of -1, all cores are used.

+    n_threads : int, default: 1
+        Number of CPU cores used for liblinear L1 one-vs-rest for more than
+        2-class classification. If given a value of -1, all cores are used.
+
     Attributes
     ----------
     coef_ : array, shape (n_classes, n_features)
@@ -1119,7 +1132,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
     def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                  fit_intercept=True, intercept_scaling=1, class_weight=None,
                  random_state=None, solver='liblinear', max_iter=100,
-                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
+                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1,
+                 n_threads=1):

         self.penalty = penalty
         self.dual = dual
@@ -1135,6 +1149,9 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
         self.verbose = verbose
         self.warm_start = warm_start
         self.n_jobs = n_jobs
+        if n_threads == -1:
+            n_threads = multiprocessing.cpu_count()
+        self.n_threads = n_threads

     def fit(self, X, y, sample_weight=None):
         """Fit the model according to the given training data.
@@ -1184,7 +1201,7 @@ def fit(self, X, y, sample_weight=None):
                 X, y, self.C, self.fit_intercept, self.intercept_scaling,
                 self.class_weight, self.penalty, self.dual, self.verbose,
                 self.max_iter, self.tol, self.random_state,
-                sample_weight=sample_weight)
+                sample_weight=sample_weight, n_threads=self.n_threads)
             self.n_iter_ = np.array([n_iter_])
             return self

@@ -1238,7 +1255,7 @@ def fit(self, X, y, sample_weight=None):
                 class_weight=self.class_weight, check_input=False,
                 random_state=self.random_state, coef=warm_start_coef_,
                 max_squared_sum=max_squared_sum,
-                sample_weight=sample_weight)
+                sample_weight=sample_weight, n_threads=self.n_threads)
             for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))

         fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
@@ -1444,6 +1461,10 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
         The seed of the pseudo random number generator to use when
         shuffling the data.

+    n_threads : int, default: 1
+        Number of CPU cores used for liblinear L1 one-vs-rest for more than
+        2-class classification. If given a value of -1, all cores are used.
+
     Attributes
     ----------
     coef_ : array, shape (1, n_features) or (n_classes, n_features)
@@ -1501,7 +1522,7 @@ def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False,
                  penalty='l2', scoring=None, solver='lbfgs', tol=1e-4,
                  max_iter=100, class_weight=None, n_jobs=1, verbose=0,
                  refit=True, intercept_scaling=1., multi_class='ovr',
-                 random_state=None):
+                 random_state=None, n_threads=1):
         self.Cs = Cs
         self.fit_intercept = fit_intercept
         self.cv = cv
@@ -1512,6 +1533,9 @@ def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False,
         self.max_iter = max_iter
         self.class_weight = class_weight
         self.n_jobs = n_jobs
+        if n_threads == -1:
+            n_threads = multiprocessing.cpu_count()
+        self.n_threads = n_threads
         self.verbose = verbose
         self.solver = solver
         self.refit = refit
@@ -1627,7 +1651,7 @@ def fit(self, X, y, sample_weight=None):
                       intercept_scaling=self.intercept_scaling,
                       random_state=self.random_state,
                       max_squared_sum=max_squared_sum,
-                      sample_weight=sample_weight
+                      sample_weight=sample_weight, n_threads=self.n_threads
                       )
             for label in iter_labels
             for train, test in folds)
@@ -1699,7 +1723,7 @@ def fit(self, X, y, sample_weight=None):
                     verbose=max(0, self.verbose - 1),
                     random_state=self.random_state,
                     check_input=False, max_squared_sum=max_squared_sum,
-                    sample_weight=sample_weight)
+                    sample_weight=sample_weight, n_threads=self.n_threads)
                 w = w[0]

             else:
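Taken together, the changes above expose the new keyword on the public estimators. A hedged usage sketch (n_threads exists only on this branch, not in released scikit-learn, and per the PR title the threading only applies to the liblinear L1 one-vs-rest path with more than two classes):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=5000, n_features=50, n_informative=10,
                           n_classes=4, random_state=0)

# Four classes -> four binary L1 subproblems inside liblinear; on this
# branch they may run on up to 4 threads. n_threads=-1 would be replaced
# by multiprocessing.cpu_count() in __init__, as the hunks above show.
clf = LogisticRegression(penalty='l1', solver='liblinear', multi_class='ovr',
                         n_threads=4)
clf.fit(X, y)
print(clf.coef_.shape)  # (4, 50): one weight vector per class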
5 changes: 3 additions & 2 deletions sklearn/linear_model/sag.py
@@ -179,8 +179,9 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1.,
     ... #doctest: +NORMALIZE_WHITESPACE
     LogisticRegression(C=1.0, class_weight=None, dual=False,
         fit_intercept=True, intercept_scaling=1, max_iter=100,
-        multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
-        solver='sag', tol=0.0001, verbose=0, warm_start=False)
+        multi_class='ovr', n_jobs=1, n_threads=1, penalty='l2',
+        random_state=None, solver='sag', tol=0.0001, verbose=0,
+        warm_start=False)

     References
     ----------
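Both doctest updates are mechanical consequences of how BaseEstimator renders its repr: parameters are printed in alphabetical order, so the new n_threads slots in after n_jobs and reflows the wrapped lines (hence the "Arguments in alphabetic order" commit). A quick standalone illustration of that ordering:

from sklearn.linear_model import LogisticRegression

# The repr is driven by get_params(); sorting its keys shows why n_threads
# would land between 'n_jobs' and 'penalty' in the doctest output.
print(sorted(LogisticRegression().get_params()))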
20 changes: 13 additions & 7 deletions sklearn/svm/base.py
@@ -74,7 +74,7 @@ class BaseLibSVM(six.with_metaclass(ABCMeta, BaseEstimator)):
     @abstractmethod
     def __init__(self, impl, kernel, degree, gamma, coef0,
                  tol, C, nu, epsilon, shrinking, probability, cache_size,
-                 class_weight, verbose, max_iter, random_state):
+                 class_weight, verbose, max_iter, random_state, n_threads=1):

         if impl not in LIBSVM_IMPL:  # pragma: no cover
             raise ValueError("impl should be one of %s, %s was given" % (
@@ -101,6 +101,7 @@ def __init__(self, impl, kernel, degree, gamma, coef0,
         self.verbose = verbose
         self.max_iter = max_iter
         self.random_state = random_state
+        self.n_threads = n_threads

     @property
     def _pairwise(self):
@@ -253,7 +254,8 @@ def _dense_fit(self, X, y, sample_weight, solver_type, kernel,
                 shrinking=self.shrinking, tol=self.tol,
                 cache_size=self.cache_size, coef0=self.coef0,
                 gamma=self._gamma, epsilon=self.epsilon,
-                max_iter=self.max_iter, random_seed=random_seed)
+                max_iter=self.max_iter, random_seed=random_seed,
+                n_threads=self.n_threads)

         self._warn_from_fit_status()

@@ -275,7 +277,7 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel,
                 self.C, self.class_weight_,
                 sample_weight, self.nu, self.cache_size, self.epsilon,
                 int(self.shrinking), int(self.probability), self.max_iter,
-                random_seed)
+                self.n_threads, random_seed)

         self._warn_from_fit_status()

@@ -506,14 +508,14 @@ class BaseSVC(six.with_metaclass(ABCMeta, BaseLibSVM, ClassifierMixin)):
     @abstractmethod
     def __init__(self, impl, kernel, degree, gamma, coef0, tol, C, nu,
                  shrinking, probability, cache_size, class_weight, verbose,
-                 max_iter, decision_function_shape, random_state):
+                 max_iter, decision_function_shape, random_state, n_threads=1):
         self.decision_function_shape = decision_function_shape
         super(BaseSVC, self).__init__(
             impl=impl, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
             tol=tol, C=C, nu=nu, epsilon=0., shrinking=shrinking,
             probability=probability, cache_size=cache_size,
             class_weight=class_weight, verbose=verbose, max_iter=max_iter,
-            random_state=random_state)
+            random_state=random_state, n_threads=n_threads)

     def _validate_targets(self, y):
         y_ = column_or_1d(y, warn=True)
@@ -776,7 +778,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
                    penalty, dual, verbose, max_iter, tol,
                    random_state=None, multi_class='ovr',
                    loss='logistic_regression', epsilon=0.1,
-                   sample_weight=None):
+                   sample_weight=None, n_threads=1):
     """Used by Logistic Regression (and CV) and LinearSVC.

     Preprocessing is done in this function before supplying it to liblinear.
@@ -854,6 +856,10 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
     sample_weight: array-like, optional
         Weights assigned to each sample.

+    n_threads : int, default: 1
+        Number of CPU cores used for liblinear L1 one-vs-rest for more than
+        2-class classification. If given a value of -1, all cores are used.
+
     Returns
     -------
     coef_ : ndarray, shape (n_features, n_features + 1)
@@ -908,7 +914,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
     solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
     raw_coef_, n_iter_ = liblinear.train_wrap(
         X, y_ind, sp.isspmatrix(X), solver_type, tol, bias, C,
-        class_weight_, max_iter, rnd.randint(np.iinfo('i').max),
+        class_weight_, max_iter, n_threads, rnd.randint(np.iinfo('i').max),
         epsilon, sample_weight)
     # Regarding rnd.randint(..) in the above signature:
     # seed for srand in range [0..INT_MAX); due to limitations in Numpy
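The docstring added here pins down what is being parallelized: for more than two classes, the liblinear L1 path solves K independent one-vs-rest binary problems, one per class. A plain-Python sketch of that decomposition (illustration only, with a hypothetical helper name; the branch runs the subproblems on threads inside liblinear's C code rather than in a Python loop):

import numpy as np
from sklearn.svm import LinearSVC  # liblinear-backed linear classifier

def fit_ovr_binary_problems(X, y):
    """Train one independent L1 binary classifier per class.

    The K subproblems share X but nothing else, which is the structure
    that makes a per-class thread pool possible.
    """
    classifiers = {}
    for klass in np.unique(y):
        target = (y == klass).astype(int)  # this class vs. the rest
        clf = LinearSVC(penalty='l1', dual=False)
        clf.fit(X, target)
        classifiers[klass] = clf
    return classifiers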
2 changes: 1 addition & 1 deletion sklearn/svm/liblinear.pxd
@@ -19,7 +19,7 @@ cdef extern from "src/liblinear/linear.h":

 cdef extern from "src/liblinear/liblinear_helper.c":
     void copy_w(void *, model *, int)
-    parameter *set_parameter(int, double, double, int, char *, char *, int, int, double)
+    parameter *set_parameter(int, double, double, int, char *, char *, int, int, int, double)
     problem *set_problem (char *, char *, np.npy_intp *, double, char *)
     problem *csr_set_problem (char *values, np.npy_intp *n_indices,
         char *indices, np.npy_intp *n_indptr, char *indptr, char *Y,
4 changes: 2 additions & 2 deletions sklearn/svm/liblinear.pyx
@@ -14,7 +14,7 @@ np.import_array()
 def train_wrap(X, np.ndarray[np.float64_t, ndim=1, mode='c'] Y,
                bint is_sparse, int solver_type, double eps, double bias,
                double C, np.ndarray[np.float64_t, ndim=1] class_weight,
-               int max_iter, unsigned random_seed, double epsilon,
+               int max_iter, int n_threads, unsigned random_seed, double epsilon,
                np.ndarray[np.float64_t, ndim=1, mode='c'] sample_weight):
     cdef parameter *param
     cdef problem *problem
@@ -42,7 +42,7 @@ def train_wrap(X, np.ndarray[np.float64_t, ndim=1, mode='c'] Y,
     class_weight_label = np.arange(class_weight.shape[0], dtype=np.intc)
     param = set_parameter(solver_type, eps, C, class_weight.shape[0],
                           class_weight_label.data, class_weight.data,
-                          max_iter, random_seed, epsilon)
+                          max_iter, n_threads, random_seed, epsilon)

     error_msg = check_parameter(problem, param)
     if error_msg:
2 changes: 1 addition & 1 deletion sklearn/svm/libsvm.pxd
@@ -45,7 +45,7 @@ cdef extern from "libsvm_helper.c":
     void set_parameter (svm_parameter *, int , int , int , double, double ,
                         double , double , double , double,
                         double, int, int, int, char *, char *, int,
-                        int)
+                        int, int)
     void set_problem (svm_problem *, char *, char *, char *, np.npy_intp *, int)

     svm_model *set_model (svm_parameter *, int, char *, np.npy_intp *,