PERF: Optimize loadtxt usecols.

anntzer · anntzer · commit 45f9118f4597 · 2021-08-17T14:50:12.000+02:00
7-10% speedup in usecols benchmarks; it appears that even in the
single-usecol case, avoiding the iteration over `usecols` more than
compensates for the cost of the extra function call to `usecols_getter`.
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
@@ -993,8 +993,8 @@ def read_data(lineno_words_iter, chunk_size):
         X = []
         for lineno, words in lineno_words_iter:
             if usecols:
-                words = [words[j] for j in usecols]
-            if len(words) != ncols:
+                words = usecols_getter(words)
+            elif len(words) != ncols:
                 raise ValueError(f"Wrong number of columns at line {lineno}")
             # Convert each value according to its column, then pack it
             # according to the dtype's nesting
@@ -1033,23 +1033,25 @@ def read_data(lineno_words_iter, chunk_size):
         byte_converters = True
 
     if usecols is not None:
-        # Allow usecols to be a single int or a sequence of ints
+        # Copy usecols, allowing it to be a single int or a sequence of ints.
         try:
-            usecols_as_list = list(usecols)
+            usecols = list(usecols)
         except TypeError:
-            usecols_as_list = [usecols]
-        for col_idx in usecols_as_list:
+            usecols = [usecols]
+        for i, col_idx in enumerate(usecols):
             try:
-                opindex(col_idx)
+                usecols[i] = opindex(col_idx)  # Cast to builtin int now.
             except TypeError as e:
                 e.args = (
                     "usecols must be an int or a sequence of ints but "
                     "it contains at least one element of type %s" %
                     type(col_idx),
                     )
                 raise
-        # Fall back to existing code
-        usecols = usecols_as_list
+        usecols_getter = (
+            itemgetter(*usecols) if len(usecols) > 1 else
+            # Get an iterable back, even if using a single column.
+            lambda obj, _col=usecols[0]: [obj[_col]])
 
     # Make sure we're dealing with a proper dtype
     dtype = np.dtype(dtype)