From 0bf907ac5ed0d3c3917cd55d0511633ea8feb8f3 Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Thu, 8 Oct 2015 17:33:02 -0400 Subject: [PATCH 1/5] BUG: Fix numerous bugs related to zero-width string arrays (#473, #1901, #2196, #2585, #4955) PyArray_NewFromDescr normally does not allow zero-width string dtypes (or rather, it automatically converts them to 1-width strings). This affects any code that uses PyArray_NewFromDescr, which is a lot. So we extend PyArray_NewFromDescr_int to allow disabling this functionality in a couple narrow cases where it's appropriate--one is when extracting a field from a structured array that has a zero-width string dtype (which, intentionally or not, has been allowed). The other, which is related, is returning a view of an array that has a zero-width string dtype. This shouldn't otherwise break or change any existing behavior. Remove roadblocks to creating structured dtypes with zero-width fields using dict-based constructors (this was possible by other means such as the list-based constructor--with the previous fix in particular it should not be necessary to block this anymore). Adds tests based on the tests cases given in the issues this is fixing. Fix a bug with array to zero-width array assignment revealed by the tests. I am slightly concerned that the blunt-force check for this in raw_array_assign_array may be masking a bug somewhere else though. --- numpy/core/_internal.py | 2 -- numpy/core/records.py | 2 +- numpy/core/src/multiarray/convert.c | 5 ++-- numpy/core/src/multiarray/ctors.c | 33 ++++++++++++---------- numpy/core/src/multiarray/ctors.h | 6 ++++ numpy/core/src/multiarray/descriptor.c | 2 +- numpy/core/src/multiarray/dtype_transfer.c | 13 +++++---- numpy/core/src/multiarray/mapping.c | 9 +++--- numpy/core/src/multiarray/methods.c | 14 ++++----- numpy/core/tests/test_dtype.py | 10 +++++++ numpy/core/tests/test_multiarray.py | 24 ++++++++++++++++ numpy/core/tests/test_records.py | 14 +++++++++ 12 files changed, 96 insertions(+), 38 deletions(-) diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py index 47c933411123..5ad440fa49b7 100644 --- a/numpy/core/_internal.py +++ b/numpy/core/_internal.py @@ -33,8 +33,6 @@ def _makenames_list(adict, align): if (num < 0): raise ValueError("invalid offset.") format = dtype(obj[0], align=align) - if (format.itemsize == 0): - raise ValueError("all itemsizes must be fixed.") if (n > 2): title = obj[2] else: diff --git a/numpy/core/records.py b/numpy/core/records.py index 9f5dcc8110ca..3bee394cdebd 100644 --- a/numpy/core/records.py +++ b/numpy/core/records.py @@ -424,7 +424,7 @@ def __new__(subtype, shape, dtype=None, buf=None, offset=0, strides=None, return self def __array_finalize__(self, obj): - if self.dtype.type is not record: + if self.dtype.type is not record and self.dtype.fields: # if self.dtype is not np.record, invoke __setattr__ which will # convert it to a record if it is a void dtype. self.dtype = self.dtype diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 5499160be51d..8d51a6dd2d36 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -14,6 +14,7 @@ #include "npy_pycompat.h" #include "arrayobject.h" +#include "ctors.h" #include "mapping.h" #include "lowlevel_strided_loops.h" #include "scalartypes.h" @@ -600,13 +601,13 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype) dtype = PyArray_DESCR(self); Py_INCREF(dtype); - ret = (PyArrayObject *)PyArray_NewFromDescr(subtype, + ret = (PyArrayObject *)PyArray_NewFromDescr_int(subtype, dtype, PyArray_NDIM(self), PyArray_DIMS(self), PyArray_STRIDES(self), PyArray_DATA(self), flags, - (PyObject *)self); + (PyObject *)self, 0, 1); if (ret == NULL) { Py_XDECREF(type); return NULL; diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 0017de0ad6e7..777121bb85a9 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -894,10 +894,11 @@ discover_dimensions(PyObject *obj, int *maxndim, npy_intp *d, int check_it, * * steals a reference to descr (even on failure) */ -static PyObject * +NPY_NO_EXPORT PyObject * PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd, npy_intp *dims, npy_intp *strides, void *data, - int flags, PyObject *obj, int zeroed) + int flags, PyObject *obj, int zeroed, + int allow_emptystring) { PyArrayObject_fields *fa; int i; @@ -916,7 +917,8 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd, newstrides, nd); ret = PyArray_NewFromDescr_int(subtype, descr, nd, newdims, newstrides, - data, flags, obj, zeroed); + data, flags, obj, zeroed, + allow_emptystring); return ret; } @@ -935,16 +937,17 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd, PyErr_SetString(PyExc_TypeError, "Empty data-type"); Py_DECREF(descr); return NULL; - } - PyArray_DESCR_REPLACE(descr); - if (descr == NULL) { - return NULL; - } - if (descr->type_num == NPY_STRING) { - nbytes = descr->elsize = 1; - } - else { - nbytes = descr->elsize = sizeof(npy_ucs4); + } else if (!allow_emptystring) { + PyArray_DESCR_REPLACE(descr); + if (descr == NULL) { + return NULL; + } + if (descr->type_num == NPY_STRING) { + nbytes = descr->elsize = 1; + } + else { + nbytes = descr->elsize = sizeof(npy_ucs4); + } } } @@ -1134,7 +1137,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, { return PyArray_NewFromDescr_int(subtype, descr, nd, dims, strides, data, - flags, obj, 0); + flags, obj, 0, 0); } /*NUMPY_API @@ -2855,7 +2858,7 @@ PyArray_Zeros(int nd, npy_intp *dims, PyArray_Descr *type, int is_f_order) type, nd, dims, NULL, NULL, - is_f_order, NULL, 1); + is_f_order, NULL, 1, 0); if (ret == NULL) { return NULL; diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h index 783818deff28..e889910cbef4 100644 --- a/numpy/core/src/multiarray/ctors.h +++ b/numpy/core/src/multiarray/ctors.h @@ -6,6 +6,12 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, npy_intp *dims, npy_intp *strides, void *data, int flags, PyObject *obj); +NPY_NO_EXPORT PyObject * +PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd, + npy_intp *dims, npy_intp *strides, void *data, + int flags, PyObject *obj, int zeroed, + int allow_emptystring); + NPY_NO_EXPORT PyObject *PyArray_New(PyTypeObject *, int nd, npy_intp *, int, npy_intp *, void *, int, int, PyObject *); diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index 7886179f1da8..7f8be5356070 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -1138,7 +1138,7 @@ _convert_from_dict(PyObject *obj, int align) } } Py_DECREF(tup); - if ((ret == NPY_FAIL) || (newdescr->elsize == 0)) { + if (ret == NPY_FAIL) { goto fail; } dtypeflags |= (newdescr->flags & NPY_FROM_FIELDS); diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index fd371a1f69d0..d7759b6cc775 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -22,6 +22,7 @@ #include "npy_pycompat.h" #include "convert_datatype.h" +#include "ctors.h" #include "_datetime.h" #include "datetime_strings.h" @@ -549,8 +550,8 @@ wrap_copy_swap_function(int aligned, * The copyswap functions shouldn't need that. */ Py_INCREF(dtype); - data->arr = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype, - 1, &shape, NULL, NULL, 0, NULL); + data->arr = (PyArrayObject *)PyArray_NewFromDescr_int(&PyArray_Type, dtype, + 1, &shape, NULL, NULL, 0, NULL, 0, 1); if (data->arr == NULL) { PyArray_free(data); return NPY_FAIL; @@ -1405,8 +1406,8 @@ get_nbo_cast_transfer_function(int aligned, return NPY_FAIL; } } - data->aip = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, tmp_dtype, - 1, &shape, NULL, NULL, 0, NULL); + data->aip = (PyArrayObject *)PyArray_NewFromDescr_int(&PyArray_Type, + tmp_dtype, 1, &shape, NULL, NULL, 0, NULL, 0, 1); if (data->aip == NULL) { PyArray_free(data); return NPY_FAIL; @@ -1429,8 +1430,8 @@ get_nbo_cast_transfer_function(int aligned, return NPY_FAIL; } } - data->aop = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, tmp_dtype, - 1, &shape, NULL, NULL, 0, NULL); + data->aop = (PyArrayObject *)PyArray_NewFromDescr_int(&PyArray_Type, + tmp_dtype, 1, &shape, NULL, NULL, 0, NULL, 0, 1); if (data->aop == NULL) { Py_DECREF(data->aip); PyArray_free(data); diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 5da76a39bef0..3d33f8a85089 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -14,6 +14,7 @@ #include "npy_import.h" #include "common.h" +#include "ctors.h" #include "iterators.h" #include "mapping.h" #include "lowlevel_strided_loops.h" @@ -1291,7 +1292,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) /* view the array at the new offset+dtype */ Py_INCREF(fieldtype); - *view = (PyArrayObject*)PyArray_NewFromDescr( + *view = (PyArrayObject*)PyArray_NewFromDescr_int( Py_TYPE(arr), fieldtype, PyArray_NDIM(arr), @@ -1299,7 +1300,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) PyArray_STRIDES(arr), PyArray_BYTES(arr) + offset, PyArray_FLAGS(arr), - (PyObject *)arr); + (PyObject *)arr, 0, 1); if (*view == NULL) { return 0; } @@ -1397,7 +1398,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) view_dtype->fields = fields; view_dtype->flags = PyArray_DESCR(arr)->flags; - *view = (PyArrayObject*)PyArray_NewFromDescr( + *view = (PyArrayObject*)PyArray_NewFromDescr_int( Py_TYPE(arr), view_dtype, PyArray_NDIM(arr), @@ -1405,7 +1406,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) PyArray_STRIDES(arr), PyArray_DATA(arr), PyArray_FLAGS(arr), - (PyObject *)arr); + (PyObject *)arr, 0, 1); if (*view == NULL) { return 0; } diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index b8e066b790ef..6346906489de 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -379,13 +379,13 @@ PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset) Py_DECREF(safe); } - ret = PyArray_NewFromDescr(Py_TYPE(self), - typed, - PyArray_NDIM(self), PyArray_DIMS(self), - PyArray_STRIDES(self), - PyArray_BYTES(self) + offset, - PyArray_FLAGS(self)&(~NPY_ARRAY_F_CONTIGUOUS), - (PyObject *)self); + ret = PyArray_NewFromDescr_int(Py_TYPE(self), + typed, + PyArray_NDIM(self), PyArray_DIMS(self), + PyArray_STRIDES(self), + PyArray_BYTES(self) + offset, + PyArray_FLAGS(self)&(~NPY_ARRAY_F_CONTIGUOUS), + (PyObject *)self, 0, 1); if (ret == NULL) { return NULL; } diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py index a6cb66b7d507..f0721d7a3292 100644 --- a/numpy/core/tests/test_dtype.py +++ b/numpy/core/tests/test_dtype.py @@ -256,6 +256,16 @@ def test_from_dictproxy(self): dt2 = np.dtype((np.void, dt.fields)) assert_equal(dt2.fields, dt.fields) + def test_from_dict_with_zero_width_field(self): + # Regression test for #6430 / #2196 + dt = np.dtype([('val1', np.float32, (0,)), ('val2', int)]) + dt2 = np.dtype({'names': ['val1', 'val2'], + 'formats': [(np.float32, (0,)), int]}) + + assert_dtype_equal(dt, dt2) + assert_equal(dt.fields['val1'][0].itemsize, 0) + assert_equal(dt.itemsize, dt.fields['val2'][0].itemsize) + def test_bool_commastring(self): d = np.dtype('?,?,?') # raises? assert_equal(len(d.names), 3) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index ec2b9fdb33be..faca404ae7dc 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -923,6 +923,30 @@ def testassign(): assert_raises(ValueError, testassign) + def test_zero_width_string(self): + # Test for PR #6430 / issues #473, #4955, #2585 + + dt = np.dtype([('I', int), ('S', 'S0')]) + + x = np.zeros(3, dtype=dt) + + assert_equal(x['S'], [b'', b'', b'']) + assert_equal(x['S'].itemsize, 0) + + x['S'] = ['a', 'b', 'c'] + assert_equal(x['S'], [b'', b'', b'']) + assert_equal(x['I'], [0, 0, 0]) + + # Variation on test case from #4955 + x['S'][x['I'] == 0] = 'hello' + assert_equal(x['S'], [b'', b'', b'']) + assert_equal(x['I'], [0, 0, 0]) + + # Variation on test case from #2585 + x['S'] = 'A' + assert_equal(x['S'], [b'', b'', b'']) + assert_equal(x['I'], [0, 0, 0]) + class TestBool(TestCase): def test_test_interning(self): diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py index 2c85546a762f..6ef6badda89c 100644 --- a/numpy/core/tests/test_records.py +++ b/numpy/core/tests/test_records.py @@ -254,6 +254,20 @@ def test_recarray_returntypes(self): assert_equal(a[0]['qux'].D, asbytes('fgehi')) assert_equal(a[0]['qux']['D'], asbytes('fgehi')) + def test_zero_width_strings(self): + # Test for #6430, based on the test case from #1901 + + cols = [['test'] * 3, [''] * 3] + rec = np.rec.fromarrays(cols) + assert_equal(rec['f0'], ['test', 'test', 'test']) + assert_equal(rec['f1'], ['', '', '']) + + dt = np.dtype([('f0', '|S4'), ('f1', '|S')]) + rec = np.rec.fromarrays(cols, dtype=dt) + assert_equal(rec.itemsize, 4) + assert_equal(rec['f0'], [b'test', b'test', b'test']) + assert_equal(rec['f1'], [b'', b'', b'']) + class TestRecord(TestCase): def setUp(self): From 7a50050386cf05b94c16ba5e8676a68a23cbd6b8 Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Fri, 6 May 2016 11:32:05 +0200 Subject: [PATCH 2/5] BUG: Fix indexing a nested structured type that contains a zero-width structure (i.e. with underlying dtype V0 as opposed to S0). Fix bug with reshape of zero-width array. See #6430 --- numpy/core/src/multiarray/ctors.c | 4 ++-- numpy/core/src/multiarray/shape.c | 4 ++-- numpy/core/tests/test_multiarray.py | 28 +++++++++++++++++++--------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 777121bb85a9..1ae03c751df9 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -933,11 +933,11 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd, /* Check datatype element size */ nbytes = descr->elsize; if (nbytes == 0) { - if (!PyDataType_ISSTRING(descr)) { + if (!PyDataType_ISFLEXIBLE(descr)) { PyErr_SetString(PyExc_TypeError, "Empty data-type"); Py_DECREF(descr); return NULL; - } else if (!allow_emptystring) { + } else if (PyDataType_ISSTRING(descr) && !allow_emptystring) { PyArray_DESCR_REPLACE(descr); if (descr == NULL) { return NULL; diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index cc02d15f9caa..30fd45f332d0 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -255,12 +255,12 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, } Py_INCREF(PyArray_DESCR(self)); - ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self), + ret = (PyArrayObject *)PyArray_NewFromDescr_int(Py_TYPE(self), PyArray_DESCR(self), ndim, dimensions, strides, PyArray_DATA(self), - flags, (PyObject *)self); + flags, (PyObject *)self, 0, 1); if (ret == NULL) { goto fail; diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index faca404ae7dc..66f6e23cbd3e 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -928,24 +928,34 @@ def test_zero_width_string(self): dt = np.dtype([('I', int), ('S', 'S0')]) - x = np.zeros(3, dtype=dt) + x = np.zeros(4, dtype=dt) - assert_equal(x['S'], [b'', b'', b'']) + assert_equal(x['S'], [b'', b'', b'', b'']) assert_equal(x['S'].itemsize, 0) - x['S'] = ['a', 'b', 'c'] - assert_equal(x['S'], [b'', b'', b'']) - assert_equal(x['I'], [0, 0, 0]) + x['S'] = ['a', 'b', 'c', 'd'] + assert_equal(x['S'], [b'', b'', b'', b'']) + assert_equal(x['I'], [0, 0, 0, 0]) # Variation on test case from #4955 x['S'][x['I'] == 0] = 'hello' - assert_equal(x['S'], [b'', b'', b'']) - assert_equal(x['I'], [0, 0, 0]) + assert_equal(x['S'], [b'', b'', b'', b'']) + assert_equal(x['I'], [0, 0, 0, 0]) # Variation on test case from #2585 x['S'] = 'A' - assert_equal(x['S'], [b'', b'', b'']) - assert_equal(x['I'], [0, 0, 0]) + assert_equal(x['S'], [b'', b'', b'', b'']) + assert_equal(x['I'], [0, 0, 0, 0]) + + # More tests for indexing an array with zero-width fields + assert_equal(np.zeros(4, dtype=[('a', 'S0,S0'), + ('b', 'u1')])['a'].itemsize, 0) + assert_equal(np.empty(3, dtype='S0,S0').itemsize, 0) + assert_equal(np.zeros(4, dtype='S0,u1')['f0'].itemsize, 0) + + xx = x['S'].reshape((2, 2)) + assert_equal(xx.itemsize, 0) + assert_equal(xx, [[b'', b''], [b'', b'']]) class TestBool(TestCase): From db7c0b74d3d85521769042ff91309baeb57aae0d Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Fri, 6 May 2016 13:20:32 +0200 Subject: [PATCH 3/5] BUG: Change ndarray.__new__ to allow zero-width data types, and in particular without converting zero-width strings to a one-width string dtype. ndarray.__new__, being relatively low-level, should not be mangling the requested dtype when creating an array. See #6430 --- numpy/core/src/multiarray/arrayobject.c | 24 ++++++++++-------------- numpy/core/tests/test_multiarray.py | 5 +++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 277c6517ca01..c21bcaecf664 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -1652,11 +1652,6 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) } itemsize = descr->elsize; - if (itemsize == 0) { - PyErr_SetString(PyExc_ValueError, - "data-type with unspecified variable length"); - goto fail; - } if (strides.ptr != NULL) { npy_intp nb, off; @@ -1690,10 +1685,11 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) if (buffer.ptr == NULL) { ret = (PyArrayObject *) - PyArray_NewFromDescr(subtype, descr, - (int)dims.len, - dims.ptr, - strides.ptr, NULL, is_f_order, NULL); + PyArray_NewFromDescr_int(subtype, descr, + (int)dims.len, + dims.ptr, + strides.ptr, NULL, is_f_order, NULL, + 0, 1); if (ret == NULL) { descr = NULL; goto fail; @@ -1726,11 +1722,11 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) buffer.flags |= NPY_ARRAY_F_CONTIGUOUS; } ret = (PyArrayObject *)\ - PyArray_NewFromDescr(subtype, descr, - dims.len, dims.ptr, - strides.ptr, - offset + (char *)buffer.ptr, - buffer.flags, NULL); + PyArray_NewFromDescr_int(subtype, descr, + dims.len, dims.ptr, + strides.ptr, + offset + (char *)buffer.ptr, + buffer.flags, NULL, 0, 1); if (ret == NULL) { descr = NULL; goto fail; diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 66f6e23cbd3e..9d7c53e67d37 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -947,6 +947,11 @@ def test_zero_width_string(self): assert_equal(x['S'], [b'', b'', b'', b'']) assert_equal(x['I'], [0, 0, 0, 0]) + # Allow zero-width dtypes in ndarray constructor + y = np.ndarray(4, dtype=x['S'].dtype) + assert_equal(y.itemsize, 0) + assert_equal(x['S'], y) + # More tests for indexing an array with zero-width fields assert_equal(np.zeros(4, dtype=[('a', 'S0,S0'), ('b', 'u1')])['a'].itemsize, 0) From ee02cdda6c8d135098baa1d5afe41fd4996d587c Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Fri, 6 May 2016 13:21:53 +0200 Subject: [PATCH 4/5] BUG: Fixes to reading and writing of empty arrays, and in particular arrays with empty dtypes. See #6430 --- numpy/core/src/multiarray/convert.c | 4 ++++ numpy/core/src/multiarray/ctors.c | 10 ++++++---- numpy/core/tests/test_multiarray.py | 18 +++++++++++++++-- numpy/lib/format.py | 31 +++++++++++++++++++---------- numpy/lib/npyio.py | 4 +++- 5 files changed, 49 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 8d51a6dd2d36..8e0c666325f3 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -133,6 +133,10 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format) "cannot write object arrays to a file in binary mode"); return -1; } + if (PyArray_DESCR(self)->elsize == 0) { + /* For zero-width data types there's nothing to write */ + return 0; + } if (npy_fallocate(PyArray_NBYTES(self), fp) != 0) { return -1; } diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 1ae03c751df9..bf060e8a5bce 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -3391,10 +3391,12 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep) return NULL; } if (dtype->elsize == 0) { - PyErr_SetString(PyExc_ValueError, - "The elements are 0-sized."); - Py_DECREF(dtype); - return NULL; + /* Nothing to read, just create an empty array of the requested type */ + return PyArray_NewFromDescr_int(&PyArray_Type, + dtype, + 1, &num, + NULL, NULL, + 0, NULL, 0, 1); } if ((sep == NULL) || (strlen(sep) == 0)) { ret = array_fromfile_binary(fp, dtype, num, &nread); diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 9d7c53e67d37..ca13a172f926 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -23,13 +23,13 @@ from numpy.core.multiarray_tests import ( test_neighborhood_iterator, test_neighborhood_iterator_oob, test_pydatamem_seteventhook_start, test_pydatamem_seteventhook_end, - test_inplace_increment, get_buffer_info, test_as_c_array + test_inplace_increment, get_buffer_info, test_as_c_array, ) from numpy.testing import ( TestCase, run_module_suite, assert_, assert_raises, assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal, assert_allclose, - assert_array_less, runstring, dec, SkipTest + assert_array_less, runstring, dec, SkipTest, temppath ) # Need to test an object that does not fully implement math interface @@ -962,6 +962,20 @@ def test_zero_width_string(self): assert_equal(xx.itemsize, 0) assert_equal(xx, [[b'', b''], [b'', b'']]) + b = io.BytesIO() + np.save(b, xx) + + b.seek(0) + yy = np.load(b) + assert_equal(yy.itemsize, 0) + assert_equal(xx, yy) + + with temppath(suffix='.npy') as tmp: + np.save(tmp, xx) + yy = np.load(tmp) + assert_equal(yy.itemsize, 0) + assert_equal(xx, yy) + class TestBool(TestCase): def test_test_interning(self): diff --git a/numpy/lib/format.py b/numpy/lib/format.py index cfe0e62acf33..e62677bd86eb 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -558,8 +558,11 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None): warnings.warn("Stored array in format 2.0. It can only be" "read by NumPy >= 1.9", UserWarning) - # Set buffer size to 16 MiB to hide the Python loop overhead. - buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) + if array.itemsize == 0: + buffersize = 0 + else: + # Set buffer size to 16 MiB to hide the Python loop overhead. + buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) if array.dtype.hasobject: # We contain Python objects so we cannot write out the data @@ -655,15 +658,21 @@ def read_array(fp, allow_pickle=True, pickle_kwargs=None): # of the read. In non-chunked case count < max_read_count, so # only one read is performed. - max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dtype.itemsize) - - array = numpy.empty(count, dtype=dtype) - for i in range(0, count, max_read_count): - read_count = min(max_read_count, count - i) - read_size = int(read_count * dtype.itemsize) - data = _read_bytes(fp, read_size, "array data") - array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype, - count=read_count) + # Use np.ndarray instead of np.empty since the latter does + # not correctly instantiate zero-width string dtypes; see + # https://github.com/numpy/numpy/pull/6430 + array = numpy.ndarray(count, dtype=dtype) + + if dtype.itemsize > 0: + # If dtype.itemsize == 0 then there's nothing more to read + max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dtype.itemsize) + + for i in range(0, count, max_read_count): + read_count = min(max_read_count, count - i) + read_size = int(read_count * dtype.itemsize) + data = _read_bytes(fp, read_size, "array data") + array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype, + count=read_count) if fortran_order: array.shape = shape[::-1] diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 4b6770483e34..179dbd8dad9d 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -399,7 +399,9 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, _ZIP_PREFIX = asbytes('PK\x03\x04') N = len(format.MAGIC_PREFIX) magic = fid.read(N) - fid.seek(-N, 1) # back-up + # If the file size is less than N, we need to make sure not + # to seek past the beginning of the file + fid.seek(-min(N, len(magic)), 1) # back-up if magic.startswith(_ZIP_PREFIX): # zip-file (assume .npz) # Transfer file ownership to NpzFile From fd95d10f4d7ed8f3cb66e4f7e73c184be13f395e Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Fri, 6 May 2016 13:22:05 +0200 Subject: [PATCH 5/5] STY: Misc whitespace cleanup. --- numpy/lib/npyio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 179dbd8dad9d..426f6e8232a6 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -89,7 +89,7 @@ def __dir__(self): def zipfile_factory(file, *args, **kwargs): """ Create a ZipFile. - + Allows for Zip64, and the `file` argument can accept file, str, or pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile constructor. @@ -743,12 +743,12 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Skip the first `skiprows` lines; default: 0. usecols : int or sequence, optional - Which columns to read, with 0 being the first. For example, + Which columns to read, with 0 being the first. For example, usecols = (1,4,5) will extract the 2nd, 5th and 6th columns. The default, None, results in all columns being read. - + .. versionadded:: 1.11.0 - + Also when a single column has to be read it is possible to use an integer instead of a tuple. E.g ``usecols = 3`` reads the third column the same way as `usecols = (3,)`` would.