Skip to content

Commit c2080f2

Browse files
committed
FIX: raise error on varying number of separators in Array.split_axes (closes #1089)
1 parent e8bec94 commit c2080f2

File tree

3 files changed

+34
-5
lines changed

3 files changed

+34
-5
lines changed

doc/source/changes/version_0_34_2.rst.inc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,7 @@ Fixes
5858
* fixed Array.reindex when using an axis object from the array as `axes_to_reindex` (closes :issue:`1088`).
5959

6060
* fixed Array.reindex({axis: list_of_labels}) (closes :issue:`1068`).
61+
62+
* Array.split_axes now raises an explicit error when some labels contain
63+
more separators than others, instead of silently dropping part of those
64+
labels, or even some data (closes :issue:`1089`).

larray/core/axis.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -382,19 +382,30 @@ def split(self, sep='_', names=None, regex=None, return_labels=False) \
382382
if not regex:
383383
# np.char.split does not work on arrays with object dtype
384384
labels = self.labels if self.labels.dtype.kind != 'O' else self.labels.astype(str)
385-
# gives us an array of lists
385+
# split_labels is an array of lists
386386
split_labels = np.char.split(labels, sep)
387387
else:
388388
match = re.compile(regex).match
389+
# split_labels is a list of tuples
389390
split_labels = [match(label).groups() for label in self.labels]
391+
first_split_label_length = len(split_labels[0])
392+
# TODO: once our lowest supported version is Python 3.10, we should use
393+
# zip(..., strict=True) instead of checking lengths explicitly
394+
if any(len(split_label) != first_split_label_length
395+
for split_label in split_labels):
396+
raise ValueError("not all labels have the same number of separators")
397+
indexing_labels = tuple(zip(*split_labels))
390398
if names is None:
391-
names = [None] * len(split_labels)
392-
indexing_labels = zip(*split_labels)
393-
if return_labels:
394-
indexing_labels = tuple(indexing_labels)
399+
names = [None] * first_split_label_length
400+
num_axes = len(indexing_labels)
401+
if num_axes != len(names):
402+
raise ValueError(f"number of resulting axes ({num_axes}) differs "
403+
f"from number of resulting axes names "
404+
f"({len(names)})")
395405
# not using np.unique because we want to keep the original order
396406
split_axes = [Axis(unique_list(ax_labels), name) for ax_labels, name in zip(indexing_labels, names)]
397407
if return_labels:
408+
assert len(split_axes) == num_axes
398409
indexing_labels = tuple(axis[labels] for axis, labels in zip(split_axes, indexing_labels))
399410
return split_axes, indexing_labels
400411
else:

larray/tests/test_array.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5497,6 +5497,20 @@ def test_split_axes():
54975497
# when no axis is specified and no axis contains the sep, split_axes is a no-op.
54985498
assert_larray_equal(combined.split_axes(), combined)
54995499

5500+
# with a varying number of sep characters in labels (issue #1089)
5501+
arr = ndtest("a_b=a0_b0,a0_b1_1,a0_b1_2")
5502+
# a_b a0_b0 a0_b1_1 a0_b1_2
5503+
# 0 1 2
5504+
with must_raise(ValueError, "not all labels have the same number of separators"):
5505+
arr.split_axes()
5506+
5507+
# with different number of sep characters in labels than in axis name
5508+
arr = ndtest("a_b=a0_b0_1,a0_b1_1,a0_b1_2")
5509+
# a_b a0_b0_1 a0_b1_1 a0_b1_2
5510+
# 0 1 2
5511+
with must_raise(ValueError, "number of resulting axes (3) differs from number of resulting axes names (2)"):
5512+
arr.split_axes()
5513+
55005514

55015515
def test_stack():
55025516
# stack along a single axis

0 commit comments

Comments
 (0)