From f60797ba64ccf33597225d23b893b6eb11149860 Mon Sep 17 00:00:00 2001 From: Mark Wiebe Date: Sun, 30 Jan 2011 23:26:43 -0800 Subject: [PATCH 1/3] BUG: perf: Operations like "a[10:20] += 10" were doing a redundant copy --- numpy/core/src/multiarray/ctors.c | 42 ++++++++++++++++++-- numpy/core/src/multiarray/new_iterator.c.src | 5 +-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index e0115db8b61a..7ebc83507cce 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -447,7 +447,8 @@ copy_and_swap(void *dst, void *src, int itemsize, npy_intp numitems, } /* Gets a half-open range [start, end) which contains the array data */ -void _get_memory_extents(PyArrayObject *arr, +NPY_NO_EXPORT void +_get_array_memory_extents(PyArrayObject *arr, npy_uintp *out_start, npy_uintp *out_end) { npy_uintp start, end; @@ -481,12 +482,13 @@ void _get_memory_extents(PyArrayObject *arr, } /* Returns 1 if the arrays have overlapping data, 0 otherwise */ -int _arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2) +NPY_NO_EXPORT int +_arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2) { npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0; - _get_memory_extents(arr1, &start1, &end1); - _get_memory_extents(arr2, &start2, &end2); + _get_array_memory_extents(arr1, &start1, &end1); + _get_array_memory_extents(arr2, &start2, &end2); return (start1 < end2) && (start2 < end1); } @@ -506,6 +508,38 @@ int _arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2) NPY_NO_EXPORT int PyArray_MoveInto(PyArrayObject *dst, PyArrayObject *src) { + /* + * Performance fix for expresions like "a[1000:6000] += x". In this + * case, first an in-place add is done, followed by an assignment, + * equivalently expressed like this: + * + * tmp = a[1000:6000] # Calls array_subscript_nice in mapping.c + * np.add(tmp, x, tmp) + * a[1000:6000] = tmp # Calls array_ass_sub in mapping.c + * + * In the assignment the underlying data type, shape, strides, and + * data pointers are identical, but src != dst because they are separately + * generated slices. By detecting this and skipping the redundant + * copy of values to themselves, we potentially give a big speed boost. + * + * Note that we don't call EquivTypes, because usually the exact same + * dtype object will appear, and we don't want to slow things down + * with a complicated comparison. The comparisons are ordered to + * try and reject this with as little work as possible. + */ + if (PyArray_DATA(src) == PyArray_DATA(dst) && + PyArray_DESCR(src) == PyArray_DESCR(dst) && + PyArray_NDIM(src) == PyArray_NDIM(dst) && + PyArray_CompareLists(PyArray_DIMS(src), + PyArray_DIMS(dst), + PyArray_NDIM(src)) && + PyArray_CompareLists(PyArray_STRIDES(src), + PyArray_STRIDES(dst), + PyArray_NDIM(src))) { + /*printf("Redundant copy operation detected\n");*/ + return 0; + } + /* * A special case is when there is just one dimension with positive * strides, and we pass that to CopyInto, which correctly handles diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src index 4d031efbd2dc..49d8ad8b5bc8 100644 --- a/numpy/core/src/multiarray/new_iterator.c.src +++ b/numpy/core/src/multiarray/new_iterator.c.src @@ -4632,10 +4632,7 @@ npyiter_allocate_arrays(NpyIter *iter, npyiter_replace_axisdata(iter, iiter, op[iiter], ondim, PyArray_DATA(op[iiter]), op_axes ? 
op_axes[iiter] : NULL); - /* - * The temporary copy is aligned and needs no cast, and - * has constant stride 0 so never needs buffering - */ + /* The temporary copy is aligned and needs no cast */ op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED; op_itflags[iiter] &= ~NPY_OP_ITFLAG_CAST; } From fcc6cc73ddcb1fc85446ba9256ac24ecdda6c6d8 Mon Sep 17 00:00:00 2001 From: Mark Wiebe Date: Mon, 31 Jan 2011 19:42:19 -0800 Subject: [PATCH 2/3] ENH: core: Small tweaks to streamline things a bit --- numpy/core/src/multiarray/convert_datatype.c | 87 +++++---- numpy/core/src/multiarray/ctors.c | 175 +++++++++---------- numpy/core/src/multiarray/mapping.c | 2 +- 3 files changed, 129 insertions(+), 135 deletions(-) diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index 27481bafc062..87b4fe7f8acf 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -486,7 +486,9 @@ promote_types(PyArray_Descr *type1, PyArray_Descr *type2, int is_small_unsigned1, int is_small_unsigned2) { if (is_small_unsigned1) { - int type_num1 = type1->type_num, type_num2 = type2->type_num, ret_type_num; + int type_num1 = type1->type_num; + int type_num2 = type2->type_num; + int ret_type_num; if (type_num2 < NPY_NTYPES && !(PyTypeNum_ISBOOL(type_num2) || PyTypeNum_ISUNSIGNED(type_num2))) { @@ -503,9 +505,9 @@ promote_types(PyArray_Descr *type1, PyArray_Descr *type2, return PyArray_PromoteTypes(type1, type2); } else if (is_small_unsigned2) { - int type_num1 = type1->type_num, - type_num2 = type2->type_num, - ret_type_num; + int type_num1 = type1->type_num; + int type_num2 = type2->type_num; + int ret_type_num; if (type_num1 < NPY_NTYPES && !(PyTypeNum_ISBOOL(type_num1) || PyTypeNum_ISUNSIGNED(type_num1))) { @@ -536,25 +538,6 @@ PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2) { int type_num1, type_num2, ret_type_num; - /* If one of the arguments is NULL, return the non-NULL one */ - if (type1 == NULL || type2 == NULL) { - if (type1 == NULL) { - if (type2 == NULL) { - PyErr_SetString(PyExc_RuntimeError, - "PromoteTypes received two NULL arguments"); - return NULL; - } - else { - Py_INCREF(type2); - return type2; - } - } - else { - Py_INCREF(type1); - return type1; - } - } - type_num1 = type1->type_num; type_num2 = type2->type_num; @@ -1146,9 +1129,12 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, Py_INCREF(ret); } else { - tmpret = PyArray_PromoteTypes(tmp, ret); - Py_DECREF(ret); - ret = tmpret; + /* Only call promote if the types aren't the same dtype */ + if (tmp != ret || !PyArray_ISNBO(ret->byteorder)) { + tmpret = PyArray_PromoteTypes(tmp, ret); + Py_DECREF(ret); + ret = tmpret; + } } } } @@ -1163,7 +1149,8 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, * unsigned integer which would fit an a signed integer * of the same size, something not exposed in the public API. 
*/ - if (PyArray_NDIM(arr[i]) == 0 && PyTypeNum_ISNUMBER(tmp->type_num)) { + if (PyArray_NDIM(arr[i]) == 0 && + PyTypeNum_ISNUMBER(tmp->type_num)) { char *data = PyArray_BYTES(arr[i]); int swap = !PyArray_ISNBO(tmp->byteorder); int type_num; @@ -1199,18 +1186,24 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, printf(" (%d) ", ret_is_small_unsigned); printf("\n"); #endif - tmpret = promote_types(tmp, ret, tmp_is_small_unsigned, - ret_is_small_unsigned); - if (tmpret == NULL) { + /* If they point to the same type, don't call promote */ + if (tmp == ret && PyArray_ISNBO(tmp->byteorder)) { + Py_DECREF(tmp); + } + else { + tmpret = promote_types(tmp, ret, tmp_is_small_unsigned, + ret_is_small_unsigned); + if (tmpret == NULL) { + Py_DECREF(tmp); + Py_DECREF(ret); + return NULL; + } Py_DECREF(tmp); Py_DECREF(ret); - return NULL; + ret = tmpret; } ret_is_small_unsigned = tmp_is_small_unsigned && ret_is_small_unsigned; - Py_DECREF(tmp); - Py_DECREF(ret); - ret = tmpret; } } @@ -1222,19 +1215,23 @@ PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, Py_INCREF(ret); } else { - if (ret_is_small_unsigned) { - tmpret = promote_types(tmp, ret, 0, ret_is_small_unsigned); - if (tmpret == NULL) { - Py_DECREF(tmp); - Py_DECREF(ret); - return NULL; + /* Only call promote if the types aren't the same dtype */ + if (tmp != ret || !PyArray_ISNBO(tmp->byteorder)) { + if (ret_is_small_unsigned) { + tmpret = promote_types(tmp, ret, 0, + ret_is_small_unsigned); + if (tmpret == NULL) { + Py_DECREF(tmp); + Py_DECREF(ret); + return NULL; + } } + else { + tmpret = PyArray_PromoteTypes(tmp, ret); + } + Py_DECREF(ret); + ret = tmpret; } - else { - tmpret = PyArray_PromoteTypes(tmp, ret); - } - Py_DECREF(ret); - ret = tmpret; } } } diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 7ebc83507cce..b433f81ba1ab 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -1054,11 +1054,11 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, if (descr->subarray) { PyObject *ret; - npy_intp newdims[2*MAX_DIMS]; + npy_intp newdims[2*NPY_MAXDIMS]; npy_intp *newstrides = NULL; memcpy(newdims, dims, nd*sizeof(npy_intp)); if (strides) { - newstrides = newdims + MAX_DIMS; + newstrides = newdims + NPY_MAXDIMS; memcpy(newstrides, strides, nd*sizeof(npy_intp)); } nd =_update_descr_and_dimensions(&descr, newdims, @@ -1068,15 +1068,11 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, data, flags, obj); return ret; } - if (nd < 0) { - PyErr_SetString(PyExc_ValueError, - "number of dimensions must be >=0"); - Py_DECREF(descr); - return NULL; - } - if (nd > MAX_DIMS) { + + if ((unsigned int)nd > (unsigned int)NPY_MAXDIMS) { PyErr_Format(PyExc_ValueError, - "maximum number of dimensions is %d", MAX_DIMS); + "number of dimensions must be within [0, %d]", + NPY_MAXDIMS); Py_DECREF(descr); return NULL; } @@ -1092,12 +1088,11 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, } PyArray_DESCR_REPLACE(descr); if (descr->type_num == NPY_STRING) { - descr->elsize = 1; + sd = descr->elsize = 1; } else { - descr->elsize = sizeof(PyArray_UCS4); + sd = descr->elsize = sizeof(PyArray_UCS4); } - sd = descr->elsize; } largest = NPY_MAX_INTP / sd; @@ -1111,6 +1106,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, */ continue; } + if (dim < 0) { PyErr_SetString(PyExc_ValueError, "negative dimensions " \ @@ -1118,14 +1114,15 @@ PyArray_NewFromDescr(PyTypeObject *subtype, 
PyArray_Descr *descr, int nd, Py_DECREF(descr); return NULL; } - if (dim > largest) { + + size *= dim; + + if (size > largest) { PyErr_SetString(PyExc_ValueError, "array is too big."); Py_DECREF(descr); return NULL; } - size *= dim; - largest /= dim; } self = (PyArrayObject *) subtype->tp_alloc(subtype, 0); @@ -1139,15 +1136,15 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, if (data == NULL) { self->flags = DEFAULT; if (flags) { - self->flags |= FORTRAN; + self->flags |= NPY_F_CONTIGUOUS; if (nd > 1) { - self->flags &= ~CONTIGUOUS; + self->flags &= ~NPY_C_CONTIGUOUS; } - flags = FORTRAN; + flags = NPY_F_CONTIGUOUS; } } else { - self->flags = (flags & ~UPDATEIFCOPY); + self->flags = (flags & ~NPY_UPDATEIFCOPY); } self->descr = descr; self->base = (PyObject *)NULL; @@ -1176,7 +1173,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, } else { self->dimensions = self->strides = NULL; - self->flags |= FORTRAN; + self->flags |= NPY_F_CONTIGUOUS; } if (data == NULL) { @@ -1189,7 +1186,8 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, if (sd == 0) { sd = descr->elsize; } - if ((data = PyDataMem_NEW(sd)) == NULL) { + data = PyDataMem_NEW(sd); + if (data == NULL) { PyErr_NoMemory(); goto fail; } @@ -1481,7 +1479,7 @@ _array_from_buffer_3118(PyObject *obj, PyObject **out) /*NUMPY_API - * Does not check for ENSURECOPY and NOTSWAPPED in flags + * Does not check for NPY_ENSURECOPY and NPY_NOTSWAPPED in flags * Steals a reference to newtype --- which can be NULL */ NPY_NO_EXPORT PyObject * @@ -1505,14 +1503,14 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, r = PyArray_FromArray((PyArrayObject *)op, newtype, flags); } else if (PyArray_IsScalar(op, Generic)) { - if (flags & UPDATEIFCOPY) { + if (flags & NPY_UPDATEIFCOPY) { goto err; } r = PyArray_FromScalar(op, newtype); } else if (newtype == NULL && (newtype = _array_find_python_scalar_type(op))) { - if (flags & UPDATEIFCOPY) { + if (flags & NPY_UPDATEIFCOPY) { goto err; } r = Array_FromPyScalar(op, newtype); @@ -1542,7 +1540,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, else { int isobject = 0; - if (flags & UPDATEIFCOPY) { + if (flags & NPY_UPDATEIFCOPY) { goto err; } if (newtype == NULL) { @@ -1559,7 +1557,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, /* necessary but not sufficient */ Py_INCREF(newtype); - r = Array_FromSequence(op, newtype, flags & FORTRAN, + r = Array_FromSequence(op, newtype, flags & NPY_F_CONTIGUOUS, min_depth, max_depth); if (r == NULL && (thiserr=PyErr_Occurred())) { if (PyErr_GivenExceptionMatches(thiserr, @@ -1574,7 +1572,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, if (isobject) { Py_INCREF(newtype); r = ObjectArray_FromNestedList - (op, newtype, flags & FORTRAN); + (op, newtype, flags & NPY_F_CONTIGUOUS); seq = TRUE; Py_DECREF(newtype); } @@ -1626,16 +1624,16 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, /* * flags is any of - * CONTIGUOUS, - * FORTRAN, - * ALIGNED, - * WRITEABLE, - * NOTSWAPPED, - * ENSURECOPY, - * UPDATEIFCOPY, - * FORCECAST, - * ENSUREARRAY, - * ELEMENTSTRIDES + * NPY_C_CONTIGUOUS (CONTIGUOUS), + * NPY_F_CONTIGUOUS (FORTRAN), + * NPY_ALIGNED, + * NPY_WRITEABLE, + * NPY_NOTSWAPPED, + * NPY_ENSURECOPY, + * NPY_UPDATEIFCOPY, + * NPY_FORCECAST, + * NPY_ENSUREARRAY, + * NPY_ELEMENTSTRIDES * * or'd (|) together * @@ -1644,24 +1642,24 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int 
min_depth, * won't guarantee it -- it will depend on the object as to whether or * not it has such features. * - * Note that ENSURECOPY is enough - * to guarantee CONTIGUOUS, ALIGNED and WRITEABLE + * Note that NPY_ENSURECOPY is enough + * to guarantee NPY_C_CONTIGUOUS, NPY_ALIGNED and NPY_WRITEABLE * and therefore it is redundant to include those as well. * - * BEHAVED == ALIGNED | WRITEABLE - * CARRAY = CONTIGUOUS | BEHAVED - * FARRAY = FORTRAN | BEHAVED + * NPY_BEHAVED == NPY_ALIGNED | NPY_WRITEABLE + * NPY_CARRAY = NPY_C_CONTIGUOUS | NPY_BEHAVED + * NPY_FARRAY = NPY_F_CONTIGUOUS | NPY_BEHAVED * - * FORTRAN can be set in the FLAGS to request a FORTRAN array. + * NPY_F_CONTIGUOUS can be set in the FLAGS to request a FORTRAN array. * Fortran arrays are always behaved (aligned, * notswapped, and writeable) and not (C) CONTIGUOUS (if > 1d). * - * UPDATEIFCOPY flag sets this flag in the returned array if a copy is + * NPY_UPDATEIFCOPY flag sets this flag in the returned array if a copy is * made and the base argument points to the (possibly) misbehaved array. * When the new array is deallocated, the original array held in base * is updated with the contents of the new array. * - * FORCECAST will cause a cast to occur regardless of whether or not + * NPY_FORCECAST will cause a cast to occur regardless of whether or not * it is safe. */ @@ -1673,7 +1671,7 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth, int max_depth, int requires, PyObject *context) { PyObject *obj; - if (requires & NOTSWAPPED) { + if (requires & NPY_NOTSWAPPED) { if (!descr && PyArray_Check(op) && !PyArray_ISNBO(PyArray_DESCR(op)->byteorder)) { descr = PyArray_DescrNew(PyArray_DESCR(op)); @@ -1690,10 +1688,10 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth, if (obj == NULL) { return NULL; } - if ((requires & ELEMENTSTRIDES) && + if ((requires & NPY_ELEMENTSTRIDES) && !PyArray_ElementStrides(obj)) { PyObject *new; - new = PyArray_NewCopy((PyArrayObject *)obj, PyArray_ANYORDER); + new = PyArray_NewCopy((PyArrayObject *)obj, NPY_ANYORDER); Py_DECREF(obj); obj = new; } @@ -1731,10 +1729,10 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) } /* - * Can't cast unless ndim-0 array, FORCECAST is specified + * Can't cast unless ndim-0 array, NPY_FORCECAST is specified * or the cast is safe. 
*/ - if (!(flags & FORCECAST) && !PyArray_NDIM(arr) == 0 && + if (!(flags & NPY_FORCECAST) && !PyArray_NDIM(arr) == 0 && !PyArray_CanCastTo(oldtype, newtype)) { Py_DECREF(newtype); PyErr_SetString(PyExc_TypeError, @@ -1744,26 +1742,27 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) } /* Don't copy if sizes are compatible */ - if ((flags & ENSURECOPY) || PyArray_EquivTypes(oldtype, newtype)) { + if ((flags & NPY_ENSURECOPY) || PyArray_EquivTypes(oldtype, newtype)) { arrflags = arr->flags; - if (arr->nd <= 1 && (flags & FORTRAN)) { - flags |= CONTIGUOUS; + if (arr->nd <= 1 && (flags & NPY_F_CONTIGUOUS)) { + flags |= NPY_C_CONTIGUOUS; } - copy = (flags & ENSURECOPY) || - ((flags & CONTIGUOUS) && (!(arrflags & CONTIGUOUS))) - || ((flags & ALIGNED) && (!(arrflags & ALIGNED))) + copy = (flags & NPY_ENSURECOPY) || + ((flags & NPY_C_CONTIGUOUS) && (!(arrflags & NPY_C_CONTIGUOUS))) + || ((flags & NPY_ALIGNED) && (!(arrflags & NPY_ALIGNED))) || (arr->nd > 1 && - ((flags & FORTRAN) && (!(arrflags & FORTRAN)))) - || ((flags & WRITEABLE) && (!(arrflags & WRITEABLE))); + ((flags & NPY_F_CONTIGUOUS) && + (!(arrflags & NPY_F_CONTIGUOUS)))) + || ((flags & NPY_WRITEABLE) && (!(arrflags & NPY_WRITEABLE))); if (copy) { - if ((flags & UPDATEIFCOPY) && + if ((flags & NPY_UPDATEIFCOPY) && (!PyArray_ISWRITEABLE(arr))) { Py_DECREF(newtype); PyErr_SetString(PyExc_ValueError, msg); return NULL; } - if ((flags & ENSUREARRAY)) { + if ((flags & NPY_ENSUREARRAY)) { subtype = &PyArray_Type; } ret = (PyArrayObject *) @@ -1771,7 +1770,7 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) arr->nd, arr->dimensions, NULL, NULL, - flags & FORTRAN, + flags & NPY_F_CONTIGUOUS, (PyObject *)arr); if (ret == NULL) { return NULL; @@ -1780,10 +1779,10 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) Py_DECREF(ret); return NULL; } - if (flags & UPDATEIFCOPY) { - ret->flags |= UPDATEIFCOPY; + if (flags & NPY_UPDATEIFCOPY) { + ret->flags |= NPY_UPDATEIFCOPY; ret->base = (PyObject *)arr; - PyArray_FLAGS(ret->base) &= ~WRITEABLE; + PyArray_FLAGS(ret->base) &= ~NPY_WRITEABLE; Py_INCREF(arr); } } @@ -1793,7 +1792,7 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) */ else { Py_DECREF(newtype); - if ((flags & ENSUREARRAY) && + if ((flags & NPY_ENSUREARRAY) && !PyArray_CheckExact(arr)) { Py_INCREF(arr->descr); ret = (PyArrayObject *) @@ -1821,20 +1820,20 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) * array type and copy was not specified */ else { - if ((flags & UPDATEIFCOPY) && + if ((flags & NPY_UPDATEIFCOPY) && (!PyArray_ISWRITEABLE(arr))) { Py_DECREF(newtype); PyErr_SetString(PyExc_ValueError, msg); return NULL; } - if ((flags & ENSUREARRAY)) { + if ((flags & NPY_ENSUREARRAY)) { subtype = &PyArray_Type; } ret = (PyArrayObject *) PyArray_NewFromDescr(subtype, newtype, arr->nd, arr->dimensions, NULL, NULL, - flags & FORTRAN, + flags & NPY_F_CONTIGUOUS, (PyObject *)arr); if (ret == NULL) { return NULL; @@ -1843,10 +1842,10 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) Py_DECREF(ret); return NULL; } - if (flags & UPDATEIFCOPY) { - ret->flags |= UPDATEIFCOPY; + if (flags & NPY_UPDATEIFCOPY) { + ret->flags |= NPY_UPDATEIFCOPY; ret->base = (PyObject *)arr; - PyArray_FLAGS(ret->base) &= ~WRITEABLE; + PyArray_FLAGS(ret->base) &= ~NPY_WRITEABLE; Py_INCREF(arr); } } @@ -1875,9 +1874,9 @@ PyArray_FromStructInterface(PyObject *input) if (inter->two != 2) { goto fail; } - if ((inter->flags 
& NOTSWAPPED) != NOTSWAPPED) { + if ((inter->flags & NPY_NOTSWAPPED) != NPY_NOTSWAPPED) { endian = PyArray_OPPBYTE; - inter->flags &= ~NOTSWAPPED; + inter->flags &= ~NPY_NOTSWAPPED; } if (inter->flags & ARR_HAS_DESCR) { @@ -1972,7 +1971,7 @@ PyArray_FromInterface(PyObject *input) if (res < 0) { goto fail; } - dataflags &= ~WRITEABLE; + dataflags &= ~NPY_WRITEABLE; } attr = PyDict_GetItemString(inter, "offset"); if (attr) { @@ -2017,7 +2016,7 @@ PyArray_FromInterface(PyObject *input) goto fail; } if (PyObject_IsTrue(PyTuple_GET_ITEM(attr,1))) { - dataflags &= ~WRITEABLE; + dataflags &= ~NPY_WRITEABLE; } } attr = tstr; @@ -2247,7 +2246,7 @@ PyArray_EnsureArray(PyObject *op) new = PyArray_FromScalar(op, NULL); } else { - new = PyArray_FromAny(op, NULL, 0, 0, ENSUREARRAY, NULL); + new = PyArray_FromAny(op, NULL, 0, 0, NPY_ENSUREARRAY, NULL); } Py_XDECREF(op); return new; @@ -3340,11 +3339,11 @@ PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type, } if (!write) { - ret->flags &= ~WRITEABLE; + ret->flags &= ~NPY_WRITEABLE; } /* Store a reference for decref on deallocation */ ret->base = buf; - PyArray_UpdateFlags(ret, ALIGNED); + PyArray_UpdateFlags(ret, NPY_ALIGNED); return (PyObject *)ret; } @@ -3561,9 +3560,9 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count) * * If data is given, then flags is flags associated with data. * If strides is not given, then a contiguous strides array will be created - * and the CONTIGUOUS bit will be set. If the flags argument - * has the FORTRAN bit set, then a FORTRAN-style strides array will be - * created (and of course the FORTRAN flag bit will be set). + * and the NPY_C_CONTIGUOUS bit will be set. If the flags argument + * has the NPY_F_CONTIGUOUS bit set, then a FORTRAN-style strides array will be + * created (and of course the NPY_F_CONTIGUOUS flag bit will be set). * * If data is not given but created here, then flags will be DEFAULT * and a non-zero flags argument can be used to indicate a FORTRAN style @@ -3576,17 +3575,16 @@ _array_fill_strides(npy_intp *strides, npy_intp *dims, int nd, size_t itemsize, { int i; /* Only make Fortran strides if not contiguous as well */ - if ((inflag & FORTRAN) && !(inflag & CONTIGUOUS)) { + if ((inflag & (NPY_F_CONTIGUOUS|NPY_C_CONTIGUOUS)) == NPY_F_CONTIGUOUS) { for (i = 0; i < nd; i++) { strides[i] = itemsize; itemsize *= dims[i] ? dims[i] : 1; } - *objflags |= FORTRAN; if (nd > 1) { - *objflags &= ~CONTIGUOUS; + *objflags = ((*objflags)|NPY_F_CONTIGUOUS) & ~NPY_C_CONTIGUOUS; } else { - *objflags |= CONTIGUOUS; + *objflags |= (NPY_F_CONTIGUOUS|NPY_C_CONTIGUOUS); } } else { @@ -3594,12 +3592,11 @@ _array_fill_strides(npy_intp *strides, npy_intp *dims, int nd, size_t itemsize, strides[i] = itemsize; itemsize *= dims[i] ? dims[i] : 1; } - *objflags |= CONTIGUOUS; if (nd > 1) { - *objflags &= ~FORTRAN; + *objflags = ((*objflags)|NPY_C_CONTIGUOUS) & ~NPY_F_CONTIGUOUS; } else { - *objflags |= FORTRAN; + *objflags |= (NPY_C_CONTIGUOUS|NPY_F_CONTIGUOUS); } } return itemsize; diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index a3d704ac77d9..8db85bf8f0b5 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -824,7 +824,7 @@ array_ass_sub(PyArrayObject *self, PyObject *index, PyObject *op) * Doing "a[...] += 1" triggers assigning an array to itself, * so this check is needed. 
*/ - if (self == op) { + if ((PyObject *)self == op) { return 0; } else { From 85993f895e7676f477f80e940a2ee925b3a0c19c Mon Sep 17 00:00:00 2001 From: Mark Wiebe Date: Mon, 31 Jan 2011 19:40:25 -0800 Subject: [PATCH 3/3] ENH: iter: Add timing code, rewrite some sections to be faster/more clear --- numpy/core/src/multiarray/new_iterator.c.src | 889 ++++++++++--------- 1 file changed, 488 insertions(+), 401 deletions(-) diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src index 49d8ad8b5bc8..caf8afb5ade3 100644 --- a/numpy/core/src/multiarray/new_iterator.c.src +++ b/numpy/core/src/multiarray/new_iterator.c.src @@ -18,6 +18,32 @@ #include "lowlevel_strided_loops.h" +/********** ITERATOR CONSTRUCTION TIMING **************/ +#define NPY_IT_CONSTRUCTION_TIMING 0 + +#if NPY_IT_CONSTRUCTION_TIMING +#define NPY_IT_TIME_POINT(var) { \ + unsigned int hi, lo; \ + __asm__ __volatile__ ( \ + "rdtsc" \ + : "=d" (hi), "=a" (lo)); \ + var = (((unsigned long long)hi) << 32) | lo; \ + } +#define NPY_IT_PRINT_TIME_START(var) { \ + printf("%30s: start\n", #var); \ + c_temp = var; \ + } +#define NPY_IT_PRINT_TIME_VAR(var) { \ + printf("%30s: %6.0f clocks\n", #var, \ + ((double)(var-c_temp))); \ + c_temp = var; \ + } +#else +#define NPY_IT_TIME_POINT(var) +#endif + +/******************************************************/ + /********** PRINTF DEBUG TRACING **************/ #define NPY_IT_DBG_TRACING 0 @@ -371,6 +397,28 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, double subtype_priority = NPY_PRIORITY; PyTypeObject *subtype = &PyArray_Type; +#if NPY_IT_CONSTRUCTION_TIMING + npy_intp c_temp, + c_start, + c_check_op_axes, + c_check_global_flags, + c_calculate_ndim, + c_malloc, + c_prepare_operands, + c_fill_axisdata, + c_compute_index_strides, + c_apply_forced_iteration_order, + c_find_best_axis_ordering, + c_get_priority_subtype, + c_find_output_common_dtype, + c_check_casting, + c_allocate_arrays, + c_coalesce_axes, + c_prepare_buffers; +#endif + + NPY_IT_TIME_POINT(c_start); + if (niter > NPY_MAXARGS) { PyErr_Format(PyExc_ValueError, "Cannot construct an iterator with more than %d operands " @@ -383,11 +431,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, return NULL; } + NPY_IT_TIME_POINT(c_check_op_axes); + /* Check the global iterator flags */ if (!npyiter_check_global_flags(flags, &itflags)) { return NULL; } + NPY_IT_TIME_POINT(c_check_global_flags); + /* Calculate how many dimensions the iterator should have */ ndim = npyiter_calculate_ndim(niter, op_in, oa_ndim); @@ -397,10 +449,14 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, ndim = 1; } + NPY_IT_TIME_POINT(c_calculate_ndim); + /* Allocate memory for the iterator */ iter = (NpyIter*) PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter)); + NPY_IT_TIME_POINT(c_malloc); + /* Fill in the basic data */ NIT_ITFLAGS(iter) = itflags; NIT_NDIM(iter) = ndim; @@ -424,6 +480,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, /* Set resetindex to zero as well (it's just after the resetdataptr) */ op_dataptr[niter] = 0; + NPY_IT_TIME_POINT(c_prepare_operands); + /* * Initialize buffer data (must set the buffers and transferdata * to NULL before we might deallocate the iterator). 
@@ -443,6 +501,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, return NULL; } + NPY_IT_TIME_POINT(c_fill_axisdata); + if (itflags&NPY_ITFLAG_BUFFER) { /* * If buffering is enabled and no buffersize was given, use a default @@ -466,6 +526,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, */ npyiter_compute_index_strides(iter, flags); + NPY_IT_TIME_POINT(c_compute_index_strides); + /* Initialize the perm to the identity */ perm = NIT_PERM(iter); for(idim = 0; idim < ndim; ++idim) { @@ -478,6 +540,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, npyiter_apply_forced_iteration_order(iter, order); itflags = NIT_ITFLAGS(iter); + NPY_IT_TIME_POINT(c_apply_forced_iteration_order); + /* Set some flags for allocated outputs */ for (iiter = 0; iiter < niter; ++iiter) { if (op[iiter] == NULL) { @@ -515,11 +579,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, itflags = NIT_ITFLAGS(iter); } + NPY_IT_TIME_POINT(c_find_best_axis_ordering); + if (need_subtype) { npyiter_get_priority_subtype(niter, op, op_itflags, &subtype_priority, &subtype); } + NPY_IT_TIME_POINT(c_get_priority_subtype); + /* * If an automatically allocated output didn't have a specified * dtype, we need to figure it out now, before allocating the outputs. @@ -544,9 +612,11 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, NPY_IT_DBG_PRINTF("Iterator: Replacing all data types\n"); /* Replace all the data types */ for (iiter = 0; iiter < niter; ++iiter) { - Py_XDECREF(op_dtype[iiter]); - Py_INCREF(dtype); - op_dtype[iiter] = dtype; + if (op_dtype[iiter] != dtype) { + Py_XDECREF(op_dtype[iiter]); + Py_INCREF(dtype); + op_dtype[iiter] = dtype; + } } } else { @@ -562,6 +632,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, Py_DECREF(dtype); } + NPY_IT_TIME_POINT(c_find_output_common_dtype); + /* * All of the data types have been settled, so it's time * to check that data type conversions are following the @@ -572,6 +644,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, return NULL; } + NPY_IT_TIME_POINT(c_check_casting); + /* * At this point, the iteration order has been finalized. so * any allocation of ops that were NULL, or any temporary @@ -584,6 +658,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, return NULL; } + NPY_IT_TIME_POINT(c_allocate_arrays); + /* * Finally, if coords weren't requested, * it may be possible to coalesce some axes together. @@ -602,6 +678,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, op_dataptr = NIT_RESETDATAPTR(iter); } + NPY_IT_TIME_POINT(c_coalesce_axes); + /* * Now that the axes are finished, check whether we can apply * the single iteration optimization to the iternext function. 
@@ -657,6 +735,29 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, } } + NPY_IT_TIME_POINT(c_prepare_buffers); + +#if NPY_IT_CONSTRUCTION_TIMING + printf("\nIterator construction timing:\n"); + NPY_IT_PRINT_TIME_START(c_start); + NPY_IT_PRINT_TIME_VAR(c_check_op_axes); + NPY_IT_PRINT_TIME_VAR(c_check_global_flags); + NPY_IT_PRINT_TIME_VAR(c_calculate_ndim); + NPY_IT_PRINT_TIME_VAR(c_malloc); + NPY_IT_PRINT_TIME_VAR(c_prepare_operands); + NPY_IT_PRINT_TIME_VAR(c_fill_axisdata); + NPY_IT_PRINT_TIME_VAR(c_compute_index_strides); + NPY_IT_PRINT_TIME_VAR(c_apply_forced_iteration_order); + NPY_IT_PRINT_TIME_VAR(c_find_best_axis_ordering); + NPY_IT_PRINT_TIME_VAR(c_get_priority_subtype); + NPY_IT_PRINT_TIME_VAR(c_find_output_common_dtype); + NPY_IT_PRINT_TIME_VAR(c_check_casting); + NPY_IT_PRINT_TIME_VAR(c_allocate_arrays); + NPY_IT_PRINT_TIME_VAR(c_coalesce_axes); + NPY_IT_PRINT_TIME_VAR(c_prepare_buffers); + printf("\n"); +#endif + return iter; } @@ -3242,297 +3343,200 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, npy_intp iiter, niter = NIT_NITER(iter); npy_intp ondim; - char *odataptr; - NpyIter_AxisData *axisdata0, *axisdata; + NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; - PyArrayObject **op = NIT_OPERANDS(iter); - - axisdata0 = NIT_AXISDATA(iter); - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); - - /* Process the first operand */ - if (op_axes == NULL || op_axes[0] == NULL) { - /* Default broadcasting rules if op_axes is not specified */ - axisdata = axisdata0; - ondim = (op[0] == NULL) ? 0 : PyArray_NDIM(op[0]); - odataptr = op_dataptr[0]; - /* Possible if op_axes are being used, but op_axes[0] is NULL */ - if (ondim > ndim) { - PyErr_SetString(PyExc_ValueError, - "Iterator input has more dimensions than allowed " - "by the 'op_axes' specified"); - return 0; - } - for (idim = 0; idim < ondim; ++idim) { - npy_intp shape; - - /* op[0] != NULL, because we set ondim to 0 in that case */ - shape = PyArray_DIM(op[0], ondim-idim-1); - - NAD_SHAPE(axisdata) = shape; - NAD_COORD(axisdata) = 0; - if (shape == 1) { - NAD_STRIDES(axisdata)[0] = 0; - } - else { - NAD_STRIDES(axisdata)[0] = PyArray_STRIDE(op[0], ondim-idim-1); - } - NAD_PTRS(axisdata)[0] = odataptr; - - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - for (idim = ondim; idim < ndim; ++idim) { - NAD_SHAPE(axisdata) = 1; - NAD_COORD(axisdata) = 0; - NAD_STRIDES(axisdata)[0] = 0; - NAD_PTRS(axisdata)[0] = odataptr; + PyArrayObject **op = NIT_OPERANDS(iter), *op_cur; + npy_intp broadcast_shape[NPY_MAXDIMS]; - NIT_ADVANCE_AXISDATA(axisdata, 1); - } + /* First broadcast the shapes together */ + for (idim = 0; idim < ndim; ++idim) { + broadcast_shape[idim] = 1; } - else { - npy_intp *axes = op_axes[0]; - - /* Use op_axes to choose the axes */ - axisdata = axisdata0; - ondim = (op[0] == NULL) ? 
ndim : PyArray_NDIM(op[0]); - odataptr = op_dataptr[0]; - for (idim = 0; idim < ndim; ++idim) { - npy_intp i = axes[ndim-idim-1]; - if (i < 0) { - NAD_SHAPE(axisdata) = 1; - NAD_COORD(axisdata) = 0; - NAD_STRIDES(axisdata)[0] = 0; - NAD_PTRS(axisdata)[0] = odataptr; - } - else if (i < ondim) { - npy_intp shape; - - if (op[0] != NULL) { - shape = PyArray_DIM(op[0], i); - } - else { - shape = 1; - } + for (iiter = 0; iiter < niter; ++iiter) { + op_cur = op[iiter]; + if (op_cur != NULL) { + npy_intp *shape = PyArray_DIMS(op_cur); + ondim = PyArray_NDIM(op_cur); - NAD_SHAPE(axisdata) = shape; - NAD_COORD(axisdata) = 0; - if (shape == 1) { - NAD_STRIDES(axisdata)[0] = 0; + if (op_axes == NULL || op_axes[iiter] == NULL) { + /* + * Possible if op_axes are being used, but + * op_axes[iiter] is NULL + */ + if (ondim > ndim) { + PyErr_SetString(PyExc_ValueError, + "input operand has more dimensions than allowed " + "by the axis remapping"); + return 0; } - else { - NAD_STRIDES(axisdata)[0] = PyArray_STRIDE(op[0], i); + for (idim = 0; idim < ondim; ++idim) { + npy_intp bshape = broadcast_shape[idim+ndim-ondim], + op_shape = shape[idim]; + if (bshape == 1) { + broadcast_shape[idim+ndim-ondim] = op_shape; + } + else if (bshape != op_shape && op_shape != 1) { + goto broadcast_error; + } } - NAD_PTRS(axisdata)[0] = odataptr; } else { - PyErr_Format(PyExc_ValueError, - "Iterator input op_axes[0][%d] (==%d) is not a valid " - "axis of op[0], which has %d dimensions", - (int)(ndim-idim-1), (int)i, (int)ondim); - return 0; + npy_intp *axes = op_axes[iiter]; + for (idim = 0; idim < ndim; ++idim) { + npy_intp i = axes[idim]; + if (i >= 0) { + if (i < ondim) { + npy_intp bshape = broadcast_shape[idim], + op_shape = shape[i]; + if (bshape == 1) { + broadcast_shape[idim] = op_shape; + } + else if (bshape != op_shape && op_shape != 1) { + goto broadcast_error; + } + } + else { + PyErr_Format(PyExc_ValueError, + "Iterator input op_axes[%d][%d] (==%d) " + "is not a valid axis of op[%d], which " + "has %d dimensions ", + (int)iiter, (int)(ndim-idim-1), (int)i, + (int)iiter, (int)ondim); + return 0; + } + } + } } - - NIT_ADVANCE_AXISDATA(axisdata, 1); } } - /* - * Process the rest of the operands, using the broadcasting rules - * to combine them. - */ - for (iiter = 1; iiter < niter; ++iiter) { - if (op_axes == NULL || op_axes[iiter] == NULL) { - axisdata = axisdata0; - ondim = (op[iiter] == NULL) ? 
0 : PyArray_NDIM(op[iiter]); - odataptr = op_dataptr[iiter]; - /* Possible if op_axes are being used, but op_axes[iiter] is NULL */ - if (ondim > ndim) { - PyErr_SetString(PyExc_ValueError, - "input operand has more dimensions than allowed " - "by the axis remapping"); - return 0; - } - for (idim = 0; idim < ondim; ++idim) { - npy_intp shape; - - /* op[iiter] != NULL, because we set ondim to 0 in that case */ - shape = PyArray_DIM(op[iiter], ondim-idim-1); + axisdata = NIT_AXISDATA(iter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); - if (shape == 1) { - NAD_STRIDES(axisdata)[iiter] = 0; - } - else { - if (NAD_SHAPE(axisdata) == 1) { - NAD_SHAPE(axisdata) = shape; - } - else if (NAD_SHAPE(axisdata) != shape) { - goto broadcast_error; - } - NAD_STRIDES(axisdata)[iiter] = PyArray_STRIDE( - op[iiter], ondim-idim-1); - } - NAD_PTRS(axisdata)[iiter] = odataptr; + /* Now process the operands, filling in the axisdata */ + for (idim = 0; idim < ndim; ++idim) { + npy_intp bshape = broadcast_shape[ndim-idim-1]; + npy_intp *strides = NAD_STRIDES(axisdata); - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - for (idim = ondim; idim < ndim; ++idim) { - NAD_STRIDES(axisdata)[iiter] = 0; - NAD_PTRS(axisdata)[iiter] = odataptr; + NAD_SHAPE(axisdata) = bshape; + NAD_COORD(axisdata) = 0; + memcpy(NAD_PTRS(axisdata), op_dataptr, NPY_SIZEOF_INTP*niter); - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - } - else { - npy_intp *axes = op_axes[iiter]; + for (iiter = 0; iiter < niter; ++iiter) { + op_cur = op[iiter]; - /* Use op_axes to choose the axes */ - axisdata = axisdata0; - ondim = (op[iiter] == NULL) ? ndim : PyArray_NDIM(op[iiter]); - odataptr = op_dataptr[iiter]; - for (idim = 0; idim < ndim; ++idim) { - npy_intp i = axes[ndim-idim-1]; - if (i < 0) { - NAD_STRIDES(axisdata)[iiter] = 0; - NAD_PTRS(axisdata)[iiter] = odataptr; + if (op_axes == NULL || op_axes[iiter] == NULL) { + if (op_cur == NULL) { + strides[iiter] = 0; } - else if (i < ondim) { - npy_intp shape; - - if (op[iiter] != NULL) { - shape = PyArray_DIM(op[iiter], i); - } - else { - shape = 1; - } - - if (shape == 1) { - NAD_STRIDES(axisdata)[iiter] = 0; + else { + ondim = PyArray_NDIM(op_cur); + if (bshape == 1) { + strides[iiter] = 0; + if (idim >= ondim && !output_scalars && + (op_flags[iiter]&NPY_ITER_NO_BROADCAST)) { + goto operand_different_than_broadcast; + } } - else { - if (NAD_SHAPE(axisdata) == 1) { - NAD_SHAPE(axisdata) = shape; + else if (idim >= ondim || + PyArray_DIM(op_cur, ondim-idim-1) == 1) { + strides[iiter] = 0; + if (op_flags[iiter]&NPY_ITER_NO_BROADCAST) { + goto operand_different_than_broadcast; } - else if (NAD_SHAPE(axisdata) != shape) { - goto broadcast_error; + /* If it's writeable, this means a reduction */ + if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { + if (!(flags&NPY_ITER_REDUCE_OK)) { + PyErr_SetString(PyExc_ValueError, + "output operand requires a reduction, but " + "reduction is not enabled"); + return 0; + } + if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { + PyErr_SetString(PyExc_ValueError, + "output operand requires a reduction, but " + "is flagged as write-only, not " + "read-write"); + return 0; + } + NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; + op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; } - NAD_STRIDES(axisdata)[iiter] = - PyArray_STRIDE(op[iiter], i); } - NAD_PTRS(axisdata)[iiter] = odataptr; - } - else { - PyErr_Format(PyExc_ValueError, - "Iterator input op_axes[%d][%d] (==%d) is not a " - "valid axis of op[%d], which has %d dimensions ", - (int)iiter, (int)(ndim-idim-1), (int)i, - (int)iiter, 
(int)ondim); - return 0; + else { + strides[iiter] = PyArray_STRIDE(op_cur, ondim-idim-1); + } } - - NIT_ADVANCE_AXISDATA(axisdata, 1); } - } - } - - /* Go through and check for operands for broadcasting and reduction */ - for (iiter = 0; iiter < niter; ++iiter) { - if (op[iiter] != NULL) { - /* - * If broadcasting is disallowed for this operand, - * unless scalars are output, which means all operands are scalar - * and no broadcasting errors could occur - */ - if ((op_flags[iiter]&NPY_ITER_NO_BROADCAST) && !output_scalars) { - npy_intp *axes; - - axes = op_axes ? op_axes[iiter] : NULL; - axisdata = axisdata0; - for (idim = 0; idim < ndim; ++idim) { - npy_intp i; - if (axes) { - i = axes[ndim-idim-1]; - } - else { - i = ndim-idim-1; + else { + npy_intp *axes = op_axes[iiter]; + npy_intp i = axes[ndim-idim-1]; + if (i >= 0) { + if (bshape == 1 || op_cur == NULL) { + strides[iiter] = 0; } - if (i >= 0 && i < PyArray_NDIM(op[iiter])) { - if (PyArray_DIM(op[iiter], i) != NAD_SHAPE(axisdata)) { + else if (PyArray_DIM(op_cur, i) == 1) { + strides[iiter] = 0; + if (op_flags[iiter]&NPY_ITER_NO_BROADCAST) { goto operand_different_than_broadcast; } - } - /* - * Also disallow broadcasting by adding additional - * dimensions, unless that part of the broadcasting - * was done explicitly through op_axes. - */ - else if (!axes) { - goto operand_different_than_broadcast; - } - /* If its writeable, this may cause a reduction */ - else if ((op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) && - NAD_SHAPE(axisdata) > 1) { - if (!(flags&NPY_ITER_REDUCE_OK)) { - PyErr_SetString(PyExc_ValueError, - "operand requires a reduction, but " - "reduction is not enabled"); - return 0; - } - if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { - PyErr_SetString(PyExc_ValueError, - "operand requires a reduction, but " - "is flagged as write-only, not " - "read-write"); - return 0; + /* If it's writeable, this means a reduction */ + if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { + if (!(flags&NPY_ITER_REDUCE_OK)) { + PyErr_SetString(PyExc_ValueError, + "output operand requires a reduction, but " + "reduction is not enabled"); + return 0; + } + if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { + PyErr_SetString(PyExc_ValueError, + "output operand requires a reduction, but " + "is flagged as write-only, not " + "read-write"); + return 0; + } + NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; + op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; } - NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; - op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; } - - NIT_ADVANCE_AXISDATA(axisdata, 1); + else { + strides[iiter] = PyArray_STRIDE(op_cur, i); + } } - } - /* Check whether this operand includes any reduction */ - else if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { - axisdata = axisdata0; - for (idim = 0; idim < ndim; ++idim) { - /* - * If the stride is 0 and the shape is bigger than - * one, that's a reduction. 
- */ - if (NAD_SHAPE(axisdata) > 1 && - NAD_STRIDES(axisdata)[iiter] == 0) { + else if (bshape == 1) { + strides[iiter] = 0; + } + else { + strides[iiter] = 0; + /* If it's writeable, this means a reduction */ + if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) { if (!(flags&NPY_ITER_REDUCE_OK)) { PyErr_SetString(PyExc_ValueError, - "operand requires a reduction, but " + "output operand requires a reduction, but " "reduction is not enabled"); return 0; } if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) { PyErr_SetString(PyExc_ValueError, - "operand requires a reduction, but " + "output operand requires a reduction, but " "is flagged as write-only, not " "read-write"); return 0; } NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE; - break; } - - NIT_ADVANCE_AXISDATA(axisdata, 1); } } } + + NIT_ADVANCE_AXISDATA(axisdata, 1); } /* Now fill in the ITERSIZE member */ - NIT_ITERSIZE(iter) = 1; - axisdata = axisdata0; - for (idim = 0; idim < ndim; ++idim) { - NIT_ITERSIZE(iter) *= NAD_SHAPE(axisdata); - - NIT_ADVANCE_AXISDATA(axisdata, 1); + NIT_ITERSIZE(iter) = broadcast_shape[0]; + for (idim = 1; idim < ndim; ++idim) { + NIT_ITERSIZE(iter) *= broadcast_shape[idim]; } /* The range defaults to everything */ NIT_ITERSTART(iter) = 0; @@ -3740,38 +3744,62 @@ npyiter_replace_axisdata(NpyIter *iter, npy_intp iiter, */ axisdata = axisdata0; - for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p; - npy_intp i, shape; + if (op_axes != NULL) { + for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { + char p; + npy_intp i, shape; - /* Apply the perm to get the original axis */ - p = perm[idim]; - if (p < 0) { - i = ndim+p; - } - else { - i = ndim-p-1; - } + /* Apply the perm to get the original axis */ + p = perm[idim]; + if (p < 0) { + i = op_axes[ndim+p]; + } + else { + i = op_axes[ndim-p-1]; + } - /* Apply op_axes */ - if (op_axes != NULL) { - i = op_axes[i]; - } - else { - i -= (ndim - op_ndim); + if ((npy_uintp)i < (npy_uintp)op_ndim) { + shape = PyArray_DIM(op, i); + if (shape != 1) { + npy_intp stride = PyArray_STRIDE(op, i); + if (p < 0) { + /* If the perm entry is negative, flip the axis */ + NAD_STRIDES(axisdata)[iiter] = -stride; + baseoffset += stride*(shape-1); + } + else { + NAD_STRIDES(axisdata)[iiter] = stride; + } + } + } } + } + else { + for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { + char p; + npy_intp i, shape; - if (i >= 0 && i < op_ndim) { - shape = PyArray_DIM(op, i); - if (shape != 1) { - npy_intp stride = PyArray_STRIDE(op, i); - if (p < 0) { - /* If the perm entry is negative, flip the axis */ - NAD_STRIDES(axisdata)[iiter] = -stride; - baseoffset += stride*(shape-1); - } - else { - NAD_STRIDES(axisdata)[iiter] = stride; + /* Apply the perm to get the original axis */ + p = perm[idim]; + if (p < 0) { + i = op_ndim+p; + } + else { + i = op_ndim-p-1; + } + + if (i >= 0) { + shape = PyArray_DIM(op, i); + if (shape != 1) { + npy_intp stride = PyArray_STRIDE(op, i); + if (p < 0) { + /* If the perm entry is negative, flip the axis */ + NAD_STRIDES(axisdata)[iiter] = -stride; + baseoffset += stride*(shape-1); + } + else { + NAD_STRIDES(axisdata)[iiter] = stride; + } } } } @@ -4266,119 +4294,149 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, npy_intp new_shape[NPY_MAXDIMS], strides[NPY_MAXDIMS], stride = op_dtype->elsize; char reversestride[NPY_MAXDIMS], anyreverse = 0; - NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, 
ndim, niter); - npy_intp tmp_op_axes = -1; - npy_intp i, array_i[NPY_MAXDIMS]; + NpyIter_AxisData *axisdata; + npy_intp sizeof_axisdata; + npy_intp i; PyArrayObject *ret; - /* - * If it's an automatically allocated output, start by assuming - * the shape will have the same length as the iterator - */ - if (shape == NULL) { - /* - * If it's a scalar output, trigger scalar allocation below - * by making an op_axes of -1 - */ - if (op_ndim == 0 && ndim == 1 && op_axes == NULL) { - op_axes = &tmp_op_axes; + /* If it's a scalar, don't need to check the axes */ + if (op_ndim == 0) { + Py_INCREF(op_dtype); + ret = (PyArrayObject *)PyArray_NewFromDescr(subtype, op_dtype, 0, + NULL, NULL, NULL, 0, NULL); + + /* Double-check that the subtype didn't mess with the dimensions */ + if (PyArray_NDIM(ret) != 0) { + PyErr_SetString(PyExc_RuntimeError, + "Iterator automatic output has an array subtype " + "which changed the dimensions of the output"); + Py_DECREF(ret); + return NULL; } - op_ndim = ndim; - } - /* Initially no strides have been set */ - for (idim = 0; idim < op_ndim; ++idim) { - strides[idim] = NPY_MAX_INTP; - reversestride[idim] = 0; + return ret; } - for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - char p; - - /* Apply the perm to get the original axis */ - p = perm[idim]; - if (p < 0) { - i = ndim+p; - } - else { - i = ndim-p-1; - } + axisdata = NIT_AXISDATA(iter); + sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); - /* Apply op_axes */ - if (op_axes != NULL) { - i = op_axes[i]; - } - else { - i -= (ndim - op_ndim); - } + memset(reversestride, 0, NPY_MAXDIMS); + /* Initialize the strides to invalid values */ + for (i = 0; i < NPY_MAXDIMS; ++i) { + strides[i] = NPY_MAX_INTP; + } - array_i[idim] = i; + if (op_axes != NULL) { + for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { + char p; - if (i >= 0) { - NPY_IT_DBG_PRINTF("Iterator: Setting allocated stride %d " - "for iterator dimension %d to %d\n", (int)i, - (int)idim, (int)stride); - strides[i] = stride; + /* Apply the perm to get the original axis */ + p = perm[idim]; if (p < 0) { - reversestride[i] = 1; - anyreverse = 1; - } - if (shape == NULL) { - new_shape[i] = NAD_SHAPE(axisdata); - stride *= new_shape[i]; + i = op_axes[ndim+p]; } else { - stride *= shape[i]; + i = op_axes[ndim-p-1]; } - } - } - /* - * If custom axes were specified, some dimensions may not have been used. - * Add the REDUCE itflag if this creates a reduction situation. 
- */ - if (shape == NULL) { - axisdata = NIT_AXISDATA(iter); - for (idim = 0; idim < op_ndim; ++idim) { - i = array_i[idim]; - NPY_IT_DBG_PRINTF("Iterator: Checking output " - "dimension %d (iterator dim %d) with stride %d\n", - (int)i, (int)idim, (int)strides[idim]); - if (i < 0) { - NPY_IT_DBG_PRINTF("Iterator: The axis wasn't used, " - "and its dimension is %d\n", - (int)NAD_SHAPE(axisdata)); - /* - * If deleting this axis produces a reduction, but - * reduction wasn't enabled, throw an error - */ - if (NAD_SHAPE(axisdata) != 1) { - if (!(flags&NPY_ITER_REDUCE_OK)) { - PyErr_SetString(PyExc_ValueError, - "output requires a reduction, but " - "reduction is not enabled"); - return NULL; - } - if (!((*op_itflags)&NPY_OP_ITFLAG_READ)) { + if (i >= 0) { + NPY_IT_DBG_PRINTF("Iterator: Setting allocated stride %d " + "for iterator dimension %d to %d\n", (int)i, + (int)idim, (int)stride); + strides[i] = stride; + if (p < 0) { + reversestride[i] = 1; + anyreverse = 1; + } + else { + reversestride[i] = 0; + } + if (shape == NULL) { + new_shape[i] = NAD_SHAPE(axisdata); + stride *= new_shape[i]; + if (i >= ndim) { PyErr_SetString(PyExc_ValueError, - "output requires a reduction, but " - "is flagged as write-only, not read-write"); + "automatically allocated output array " + "specified with an inconsistent axis mapping"); return NULL; } + } + else { + stride *= shape[i]; + } + } + else { + if (shape == NULL) { + /* + * If deleting this axis produces a reduction, but + * reduction wasn't enabled, throw an error + */ + if (NAD_SHAPE(axisdata) != 1) { + if (!(flags&NPY_ITER_REDUCE_OK)) { + PyErr_SetString(PyExc_ValueError, + "output requires a reduction, but " + "reduction is not enabled"); + return NULL; + } + if (!((*op_itflags)&NPY_OP_ITFLAG_READ)) { + PyErr_SetString(PyExc_ValueError, + "output requires a reduction, but " + "is flagged as write-only, not read-write"); + return NULL; + } - NPY_IT_DBG_PRINTF("Iterator: Indicating that a reduction " - "is occurring\n"); - /* Indicate that a reduction is occurring */ - NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; - (*op_itflags) |= NPY_OP_ITFLAG_REDUCE; + NPY_IT_DBG_PRINTF("Iterator: Indicating that a reduction " + "is occurring\n"); + /* Indicate that a reduction is occurring */ + NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; + (*op_itflags) |= NPY_OP_ITFLAG_REDUCE; + } } } + } + } + else { + for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { + char p; - NIT_ADVANCE_AXISDATA(axisdata, 1); + /* Apply the perm to get the original axis */ + p = perm[idim]; + if (p < 0) { + i = op_ndim+p; + } + else { + i = op_ndim-p-1; + } + + if (i >= 0) { + NPY_IT_DBG_PRINTF("Iterator: Setting allocated stride %d " + "for iterator dimension %d to %d\n", (int)i, + (int)idim, (int)stride); + strides[i] = stride; + if (p < 0) { + reversestride[i] = 1; + anyreverse = 1; + } + else { + reversestride[i] = 0; + } + if (shape == NULL) { + new_shape[i] = NAD_SHAPE(axisdata); + stride *= new_shape[i]; + } + else { + stride *= shape[i]; + } + } } + } + /* + * If custom axes were specified, some dimensions may not have been used. + * Add the REDUCE itflag if this creates a reduction situation. 
+ */ + if (shape == NULL) { /* Ensure there are no dimension gaps in op_axes, and find op_ndim */ op_ndim = ndim; if (op_axes != NULL) { @@ -4395,7 +4453,7 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, */ else if (op_ndim != ndim) { PyErr_SetString(PyExc_ValueError, - "automatically allocated reduction output array " + "automatically allocated output array " "specified with an inconsistent axis mapping"); return NULL; } @@ -4682,49 +4740,54 @@ npyiter_allocate_arrays(NpyIter *iter, * the inner stride of this operand works for the whole * array, we can set NPY_OP_ITFLAG_BUFNEVER. */ - if (PyArray_NDIM(op[iiter]) > 0 && (itflags&NPY_ITFLAG_BUFFER) && - !(op_itflags[iiter]&NPY_OP_ITFLAG_CAST)) { - npy_intp stride, shape, innerstride = 0, innershape; + if ((itflags&NPY_ITFLAG_BUFFER) && !(op_itflags[iiter]&NPY_OP_ITFLAG_CAST)) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = - NIT_AXISDATA_SIZEOF(itflags, ndim, niter); - /* Find stride of the first non-empty shape */ - for (idim = 0; idim < ndim; ++idim) { - innershape = NAD_SHAPE(axisdata); - if (innershape != 1) { - innerstride = NAD_STRIDES(axisdata)[iiter]; - break; - } - NIT_ADVANCE_AXISDATA(axisdata, 1); + if (ndim == 1) { + op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; + NBF_STRIDES(bufferdata)[iiter] = NAD_STRIDES(axisdata)[iiter]; } - ++idim; - NIT_ADVANCE_AXISDATA(axisdata, 1); - /* Check that everything could have coalesced together */ - for (; idim < ndim; ++idim) { - stride = NAD_STRIDES(axisdata)[iiter]; - shape = NAD_SHAPE(axisdata); - if (shape != 1) { - /* - * If N times the inner stride doesn't equal this - * stride, the multi-dimensionality is needed. - */ - if (innerstride*innershape != stride) { + else if (PyArray_NDIM(op[iiter]) > 0) { + npy_intp stride, shape, innerstride = 0, innershape; + npy_intp sizeof_axisdata = + NIT_AXISDATA_SIZEOF(itflags, ndim, niter); + /* Find stride of the first non-empty shape */ + for (idim = 0; idim < ndim; ++idim) { + innershape = NAD_SHAPE(axisdata); + if (innershape != 1) { + innerstride = NAD_STRIDES(axisdata)[iiter]; break; } - else { - innershape *= shape; - } + NIT_ADVANCE_AXISDATA(axisdata, 1); } + ++idim; NIT_ADVANCE_AXISDATA(axisdata, 1); - } - /* - * If we looped all the way to the end, one stride works. - * Set that stride, because it may not belong to the first - * dimension. - */ - if (idim == ndim) { - op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; - NBF_STRIDES(bufferdata)[iiter] = innerstride; + /* Check that everything could have coalesced together */ + for (; idim < ndim; ++idim) { + stride = NAD_STRIDES(axisdata)[iiter]; + shape = NAD_SHAPE(axisdata); + if (shape != 1) { + /* + * If N times the inner stride doesn't equal this + * stride, the multi-dimensionality is needed. + */ + if (innerstride*innershape != stride) { + break; + } + else { + innershape *= shape; + } + } + NIT_ADVANCE_AXISDATA(axisdata, 1); + } + /* + * If we looped all the way to the end, one stride works. + * Set that stride, because it may not belong to the first + * dimension. 
+ */ + if (idim == ndim) { + op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; + NBF_STRIDES(bufferdata)[iiter] = innerstride; + } } } } @@ -4771,6 +4834,7 @@ npyiter_get_common_dtype(npy_intp niter, PyArrayObject **op, npy_intp narrs = 0, ndtypes = 0; PyArrayObject *arrs[NPY_MAXARGS]; PyArray_Descr *dtypes[NPY_MAXARGS]; + PyArray_Descr *ret; NPY_IT_DBG_PRINTF("Iterator: Getting a common data type from operands\n"); @@ -4790,7 +4854,30 @@ npyiter_get_common_dtype(npy_intp niter, PyArrayObject **op, } } - return PyArray_ResultType(narrs, arrs, ndtypes, dtypes); + if (narrs == 0) { + npy_intp i; + ret = dtypes[0]; + for (i = 1; i < ndtypes; ++i) { + if (ret != dtypes[i]) + break; + } + if (i == ndtypes) { + if (ndtypes == 1 || PyArray_ISNBO(ret->byteorder)) { + Py_INCREF(ret); + } + else { + ret = PyArray_DescrNewByteorder(ret, NPY_NATIVE); + } + } + else { + ret = PyArray_ResultType(narrs, arrs, ndtypes, dtypes); + } + } + else { + ret = PyArray_ResultType(narrs, arrs, ndtypes, dtypes); + } + + return ret; } static int