From 2a32a395dc89c9eb17baadbd9fd56c2d14408ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 16:59:47 +0200 Subject: [PATCH 01/44] fixed arr1d @ arr1d returning a 0D LArray instead of a scalar --- larray/core/array.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/larray/core/array.py b/larray/core/array.py index d74826374..c1ca45c3c 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -5081,7 +5081,10 @@ def __matmul__(self, other): res_axes += [axes[-2]] if other.ndim > 1: res_axes += [other_axes[-1].copy()] - return LArray(res_data, res_axes) + if res_axes: + return LArray(res_data, res_axes) + else: + return res_data def __rmatmul__(self, other): if isinstance(other, np.ndarray): From ca76975aeb9671af2cc1bd2347849ad73a675a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 15:52:58 +0200 Subject: [PATCH 02/44] fixed Axis(values, name=np.str_) --- larray/core/axis.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index be97a1224..0021b29b7 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -92,8 +92,11 @@ def __init__(self, labels, name=None): # make sure we do not have np.str_ as it causes problems down the # line with xlwings. Cannot use isinstance to check that though. 
- is_python_str = type(name) is unicode or type(name) is bytes - assert name is None or isinstance(name, int) or is_python_str, type(name) + name_is_python_str = type(name) is unicode or type(name) is bytes + if isinstance(name, str) and not name_is_python_str: + name = str(name) + if name is not None and not isinstance(name, (int, str)): + raise TypeError("Axis name should be None, int or str but is: %s (%s)" % (name, type(name).__name__)) self.name = name self._labels = None self.__mapping = None From b39c87b4634d3c0deba452dc166fac003a3ee893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 21 Nov 2018 09:53:34 +0100 Subject: [PATCH 03/44] WIP: fixed loading pd.Dataframe with non string index names (needs test & changelog) --- larray/inout/pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py index 017c400a4..c40801977 100644 --- a/larray/inout/pandas.py +++ b/larray/inout/pandas.py @@ -211,7 +211,8 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo a1 b0 4 5 a1 b1 6 7 """ - axes_names = [decode(name, 'utf8') for name in df.index.names] + axes_names = [decode(name, 'utf8') if isinstance(name, basestring) else name + for name in df.index.names] # handle 2 or more dimensions with the last axis name given using \ if unfold_last_axis_name: @@ -303,7 +304,8 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header raise ValueError('sort_rows=True is not valid for 1D arrays. 
Please use sort_columns instead.') return from_series(series, sort_rows=sort_columns) else: - axes_names = [decode(name, 'utf8') for name in df.index.names] + axes_names = [decode(name, 'utf8') if isinstance(name, basestring) else name + for name in df.index.names] unfold_last_axis_name = isinstance(axes_names[-1], basestring) and '\\' in axes_names[-1] return from_frame(df, sort_rows=sort_rows, sort_columns=sort_columns, parse_header=parse_header, unfold_last_axis_name=unfold_last_axis_name, **kwargs) From c7ddd6a3a47d9d1f473b7d1074c602880accc101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 14 Dec 2018 08:12:31 +0100 Subject: [PATCH 04/44] WIP: fixed LArray.broadcast_with with out= (needs test) --- larray/core/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/larray/core/array.py b/larray/core/array.py index c1ca45c3c..04012789a 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -5509,6 +5509,8 @@ def expand(self, target_axes=None, out=None, readonly=False): return LArray(np.broadcast_to(broadcasted, target_axes.shape), target_axes) else: out = empty(target_axes, dtype=self.dtype) + else: + broadcasted = self out[:] = broadcasted return out From afc7ebb4cedfec776c17aca59701830d4d915cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 14 Nov 2018 10:48:21 +0100 Subject: [PATCH 05/44] fixed many warning messages when running the test suite (invalid escape) used raw strings in a lot of place, especially in regex --- larray/core/array.py | 94 +++++++++++++++++++------------------- larray/core/axis.py | 2 +- larray/core/group.py | 8 ++-- larray/tests/test_array.py | 26 +++++------ 4 files changed, 65 insertions(+), 65 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 04012789a..e260c6201 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -401,7 +401,7 @@ def __setitem__(self, key, value): def get_axis(obj, i): - """ + r""" Returns an axis 
according to its position. Parameters @@ -497,7 +497,7 @@ def _doc_agg_method(func, by=False, long_name='', action_verb='perform', extra_a doc_args = "".join(_arg_agg[arg] for arg in extra_args) doc_kwargs = "".join(_kwarg_agg[kw]['doc'] for kw in kwargs) - doc_varargs = """ + doc_varargs = r""" \*axes_and_groups : None or int or str or Axis or Group or any combination of those {specific} The default (no axis or group) is to {action_verb} the {long_name} over all the dimensions of the input @@ -1193,7 +1193,7 @@ def describe(self, *args, **kwargs): [self.percentile(p, *args) for p in percentiles], Axis(labels, 'statistic')) def describe_by(self, *args, **kwargs): - """ + r""" Descriptive summary statistics, excluding NaN values, along axes or for groups. By default, it includes the number of non-NaN values, the mean, standard deviation, minimum, maximum and @@ -1465,7 +1465,7 @@ def get_labels(self_axis): return res def align(self, other, join='outer', fill_value=nan, axes=None): - """Align two arrays on their axes with the specified join method. + r"""Align two arrays on their axes with the specified join method. In other words, it ensure all common axes are compatible. Those arrays can then be used in binary operations. 
@@ -1500,14 +1500,14 @@ def align(self, other, join='outer', fill_value=nan, axes=None): -------- >>> arr1 = ndtest((2, 3)) >>> arr1 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr2 = -ndtest((3, 2)) >>> # reorder array to make the test more interesting >>> arr2 = arr2[['b1', 'b0']] >>> arr2 - a\\b b1 b0 + a\b b1 b0 a0 -1 0 a1 -3 -2 a2 -5 -4 @@ -1516,12 +1516,12 @@ def align(self, other, join='outer', fill_value=nan, axes=None): >>> aligned1, aligned2 = arr1.align(arr2) >>> aligned1 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 4.0 5.0 a2 nan nan nan >>> aligned2 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 -1.0 nan a1 -2.0 -3.0 nan a2 -4.0 -5.0 nan @@ -1529,7 +1529,7 @@ def align(self, other, join='outer', fill_value=nan, axes=None): After aligning all common axes, one can then do operations between the two arrays >>> aligned1 + aligned2 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 0.0 nan a1 1.0 1.0 nan a2 nan nan nan @@ -1538,30 +1538,30 @@ def align(self, other, join='outer', fill_value=nan, axes=None): >>> aligned1, aligned2 = arr1.align(arr2, join='inner') >>> aligned1 - a\\b b0 b1 + a\b b0 b1 a0 0.0 1.0 a1 3.0 4.0 >>> aligned2 - a\\b b0 b1 + a\b b0 b1 a0 0.0 -1.0 a1 -2.0 -3.0 >>> aligned1, aligned2 = arr1.align(arr2, join='left') >>> aligned1 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 4.0 5.0 >>> aligned2 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 -1.0 nan a1 -2.0 -3.0 nan >>> aligned1, aligned2 = arr1.align(arr2, join='right') >>> aligned1 - a\\b b1 b0 + a\b b1 b0 a0 1.0 0.0 a1 4.0 3.0 a2 nan nan >>> aligned2 - a\\b b1 b0 + a\b b1 b0 a0 -1.0 0.0 a1 -3.0 -2.0 a2 -5.0 -4.0 @@ -1570,17 +1570,17 @@ def align(self, other, join='outer', fill_value=nan, axes=None): >>> aligned1, aligned2 = arr1.align(arr2, fill_value=0) >>> aligned1 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0 1 2 a1 3 4 5 a2 0 0 0 >>> aligned2 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0 -1 0 a1 -2 -3 0 a2 -4 -5 0 >>> aligned1 + aligned2 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0 0 2 a1 1 1 5 a2 -4 -5 0 @@ 
-1589,11 +1589,11 @@ def align(self, other, join='outer', fill_value=nan, axes=None): >>> arr3 = ndtest((3, 2, 2)) >>> arr1 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0 1 2 a1 3 4 5 >>> arr3 - a b\\c c0 c1 + a b\c c0 c1 a0 b0 0 1 a0 b1 2 3 a1 b0 4 5 @@ -1602,7 +1602,7 @@ def align(self, other, join='outer', fill_value=nan, axes=None): a2 b1 10 11 >>> aligned1, aligned2 = arr1.align(arr3, join='inner') >>> aligned1 - a\\b b0 b1 + a\b b0 b1 a0 0.0 1.0 a1 3.0 4.0 >>> aligned2 @@ -1612,7 +1612,7 @@ def align(self, other, join='outer', fill_value=nan, axes=None): a1 b0 4.0 5.0 a1 b1 6.0 7.0 >>> aligned1 + aligned2 - a b\\c c0 c1 + a b\c c0 c1 a0 b0 0.0 1.0 a0 b1 3.0 4.0 a1 b0 7.0 8.0 @@ -1622,11 +1622,11 @@ def align(self, other, join='outer', fill_value=nan, axes=None): >>> aligned1, aligned2 = arr1.align(arr2, axes='b') >>> aligned1 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 1.0 2.0 a1 3.0 4.0 5.0 >>> aligned2 - a\\b b0 b1 b2 + a\b b0 b1 b2 a0 0.0 -1.0 nan a1 -2.0 -3.0 nan a2 -4.0 -5.0 nan @@ -5515,7 +5515,7 @@ def expand(self, target_axes=None, out=None, readonly=False): return out def append(self, axis, value, label=None): - """Adds an array to self along an axis. + r"""Adds an array to self along an axis. The two arrays must have compatible axes. 
@@ -5537,15 +5537,15 @@ def append(self, axis, value, label=None): -------- >>> a = ones('nat=BE,FO;sex=M,F') >>> a - nat\\sex M F + nat\sex M F BE 1.0 1.0 FO 1.0 1.0 >>> a.append('sex', a.sum('sex'), 'M+F') - nat\\sex M F M+F + nat\sex M F M+F BE 1.0 1.0 2.0 FO 1.0 1.0 2.0 >>> a.append('nat', 2, 'Other') - nat\\sex M F + nat\sex M F BE 1.0 1.0 FO 1.0 1.0 Other 2.0 2.0 @@ -5554,7 +5554,7 @@ def append(self, axis, value, label=None): type type1 type2 0.0 0.0 >>> a.append('nat', b, 'Other') - nat sex\\type type1 type2 + nat sex\type type1 type2 BE M 1.0 1.0 BE F 1.0 1.0 FO M 1.0 1.0 @@ -5566,7 +5566,7 @@ def append(self, axis, value, label=None): return self.insert(value, before=IGroup(len(axis), axis=axis), label=label) def prepend(self, axis, value, label=None): - """Adds an array before self along an axis. + r"""Adds an array before self along an axis. The two arrays must have compatible axes. @@ -5592,11 +5592,11 @@ def prepend(self, axis, value, label=None): BE 1.0 1.0 FO 1.0 1.0 >>> a.prepend('sex', a.sum('sex'), 'M+F') - nat\\sex M+F M F + nat\sex M+F M F BE 2.0 1.0 1.0 FO 2.0 1.0 1.0 >>> a.prepend('nat', 2, 'Other') - nat\\sex M F + nat\sex M F Other 2.0 2.0 BE 1.0 1.0 FO 1.0 1.0 @@ -5605,7 +5605,7 @@ def prepend(self, axis, value, label=None): type type1 type2 0.0 0.0 >>> a.prepend('sex', b, 'Other') - nat sex\\type type1 type2 + nat sex\type type1 type2 BE Other 0.0 0.0 BE M 1.0 1.0 BE F 1.0 1.0 @@ -6318,7 +6318,7 @@ def to_clipboard(self, *args, **kwargs): @property def plot(self): - """Plots the data of the array into a graph (window pop-up). + r"""Plots the data of the array into a graph (window pop-up). The graph can be tweaked to achieve the desired formatting and can be saved to a .png file. @@ -7137,7 +7137,7 @@ def larray_nan_equal(a1, a2): def aslarray(a, meta=None): - """ + r""" Converts input as LArray if possible. 
Parameters @@ -7201,7 +7201,7 @@ def wrapper(*args, **kwargs): @_check_axes_argument def zeros(axes, title=None, dtype=float, order='C', meta=None): - """Returns an array with the specified axes and filled with zeros. + r"""Returns an array with the specified axes and filled with zeros. Parameters ---------- @@ -7516,7 +7516,7 @@ def full_like(array, fill_value, title=None, dtype=None, order='K', meta=None): meta = _handle_deprecated_argument_title(meta, title) # cannot use full() because order == 'K' is not understood # cannot use np.full_like() because it would not handle LArray fill_value - res = empty_like(array, dtype, meta=meta) + res = empty_like(array, dtype=dtype, meta=meta) res[:] = fill_value return res @@ -8122,7 +8122,7 @@ def eye(rows, columns=None, k=0, title=None, dtype=None, meta=None): def stack(elements=None, axis=None, title=None, meta=None, **kwargs): - """ + r""" Combines several arrays or sessions along an axis. Parameters @@ -8163,41 +8163,41 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): In the case the axis to create has already been defined in a variable (Axis or Group) >>> stack({'BE': arr1, 'FO': arr2}, nat) - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 >>> all_nat = Axis('nat=BE,DE,FR,NL,UK') >>> stack({'BE': arr1, 'DE': arr2}, all_nat[:'DE']) - sex\\nat BE DE + sex\nat BE DE M 1.0 0.0 F 1.0 0.0 Otherwise (when one wants to create an axis from scratch), any of these syntaxes works: >>> stack([arr1, arr2], 'nat=BE,FO') - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 >>> stack({'BE': arr1, 'FO': arr2}, 'nat=BE,FO') - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 >>> stack([('BE', arr1), ('FO', arr2)], 'nat=BE,FO') - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 When stacking arrays with different axes, the result has the union of all axes present: >>> stack({'BE': arr1, 'FO': 0}, nat) - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 Creating an axis without name nor labels can be done 
using: >>> stack((arr1, arr2)) - sex\\{1}* 0 1 + sex\{1}* 0 1 M 1.0 0.0 F 1.0 0.0 @@ -8205,7 +8205,7 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): arguments can be an attractive alternative. >>> stack(FO=arr2, BE=arr1, axis=nat) - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 @@ -8214,7 +8214,7 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): >>> # use this only on Python 3.6 and later >>> stack(BE=arr1, FO=arr2, axis='nat') # doctest: +SKIP - sex\\nat BE FO + sex\nat BE FO M 1.0 0.0 F 1.0 0.0 diff --git a/larray/core/axis.py b/larray/core/axis.py index 0021b29b7..7b636a00c 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -2648,7 +2648,7 @@ def names(self): @property def display_names(self): - """ + r""" Returns the list of (display) names of the axes. Returns diff --git a/larray/core/group.py b/larray/core/group.py index 8c95d07ed..2e6d703e1 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -196,7 +196,7 @@ def generalized_range(start, stop, step=1): return irange(start, stop, step) -_range_str_pattern = re.compile('(?P<start>[^\s.]+)?\s*\.\.\s*(?P<stop>[^\s.]+)?(\s+step\s+(?P<step>\d+))?') +_range_str_pattern = re.compile(r'(?P<start>[^\s.]+)?\s*\.\.\s*(?P<stop>[^\s.]+)?(\s+step\s+(?P<step>\d+))?') def _range_str_to_range(s, stack_depth=1): @@ -435,7 +435,7 @@ def _to_ticks(s, parse_single_int=False): return np.asarray(ticks) -_axis_name_pattern = re.compile('\s*(([A-Za-z0-9]\w*)(\.i)?\s*\[)?(.*)') +_axis_name_pattern = re.compile(r'\s*(([A-Za-z0-9]\w*)(\.i)?\s*\[)?(.*)') def _seq_str_to_seq(s, stack_depth=1, parse_single_int=False): @@ -645,7 +645,7 @@ def _to_keys(value, stack_depth=1): # forbidden characters in sheet names -_sheet_name_pattern = re.compile('[\\\/?*\[\]:]') +_sheet_name_pattern = re.compile(r'[\\/?*\[\]:]') def _translate_sheet_name(sheet_name): @@ -659,7 +659,7 @@ def _translate_sheet_name(sheet_name): # forbidden characters for dataset names in HDF files -_key_hdf_pattern = re.compile('[\\\/]') 
+_key_hdf_pattern = re.compile(r'[\\/]') def _translate_group_key_hdf(key): diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 652d4637e..c54497c09 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -290,12 +290,12 @@ def test_str(small_array, array): 0 1 2""" # two dimensions assert str(small_array.filter(lipro=lipro3)) == """\ -sex\lipro P01 P02 P03 +sex\\lipro P01 P02 P03 M 0 1 2 F 15 16 17""" # four dimensions (too many rows) assert str(array.filter(lipro=lipro3)) == """\ -age geo sex\lipro P01 P02 P03 +age geo sex\\lipro P01 P02 P03 0 A11 M 0.0 1.0 2.0 0 A11 F 15.0 16.0 17.0 0 A12 M 30.0 31.0 32.0 @@ -477,21 +477,21 @@ def test_getitem_guess_axis(array): array[[1, 2], 999] # key with invalid label list (ie list of labels not found on any axis) - with pytest.raises(ValueError, message="\[998, 999\] is not a valid label for any axis"): + with pytest.raises(ValueError, message=r"\[998, 999\] is not a valid label for any axis"): array[[1, 2], [998, 999]] # key with partial invalid list (ie list containing a label not found # on any axis) # FIXME: the message should be the same as for 999, 4 (ie it should NOT mention age). 
- with pytest.raises(ValueError, message="age\[3, 999\] is not a valid label for any axis"): + with pytest.raises(ValueError, message=r"age\[3, 999\] is not a valid label for any axis"): array[[1, 2], [3, 999]] - with pytest.raises(ValueError, message="\[999, 4\] is not a valid label for any axis"): + with pytest.raises(ValueError, message=r"\[999, 4\] is not a valid label for any axis"): array[[1, 2], [999, 4]] # ambiguous key arr = ndtest("a=l0,l1;b=l1,l2") - with pytest.raises(ValueError, message="l1 is ambiguous \(valid in a, b\)"): + with pytest.raises(ValueError, message=r"l1 is ambiguous \(valid in a, b\)"): arr['l1'] # ambiguous key disambiguated via string @@ -2176,7 +2176,7 @@ def test_sum_with_groups_from_other_axis(small_array): # use a group (from another axis) which is incompatible with the axis of # the same name in the array lipro4 = Axis('lipro=P01,P03,P16') - with pytest.raises(ValueError, message="lipro\['P01', 'P16'\] is not a valid label for any axis"): + with pytest.raises(ValueError, message=r"lipro\['P01', 'P16'\] is not a valid label for any axis"): small_array.sum(lipro4['P01,P16']) @@ -2901,7 +2901,7 @@ def test_hdf_roundtrip(tmpdir, meta): group = a3.c['c0,c2'] >> 'even' a3[group].to_hdf(fpath, group) # group with name containing special characters (replaced by _) - group = a3.c['c0,c2'] >> ':name?with*special/\[characters]' + group = a3.c['c0,c2'] >> r':name?with*special/\[characters]' a3[group].to_hdf(fpath, group) # passing group as key to read_hdf @@ -3773,7 +3773,7 @@ def test_to_excel_xlsxwriter(tmpdir): group = a3.c['c0,c2'] >> 'even' a3[group].to_excel(fpath, group, engine='xlsxwriter') # group with name containing special characters (replaced by _) - group = a3.c['c0,c2'] >> ':name?with*special/\[char]' + group = a3.c['c0,c2'] >> r':name?with*special/\[char]' a3[group].to_excel(fpath, group, engine='xlsxwriter') @@ -3863,7 +3863,7 @@ def test_to_excel_xlwings(tmpdir): group = a3.c['c0,c2'] >> 'even' a3[group].to_excel(fpath, 
group, engine='xlwings') # group with name containing special characters (replaced by _) - group = a3.c['c0,c2'] >> ':name?with*special/\[char]' + group = a3.c['c0,c2'] >> r':name?with*special/\[char]' a3[group].to_excel(fpath, group, engine='xlwings') # checks sheet names sheet_names = sorted(open_excel(fpath).sheet_names()) @@ -4259,7 +4259,7 @@ def test_matmul(): # different axes a1 = ndtest('a=a0..a1;b=b0..b2') a2 = ndtest('b=b0..b2;c=c0..c3') - res = from_lists([['a\c', 'c0', 'c1', 'c2', 'c3'], + res = from_lists([[r'a\c', 'c0', 'c1', 'c2', 'c3'], ['a0', 20, 23, 26, 29], ['a1', 56, 68, 80, 92]]) assert_array_equal(a1.__matmul__(a2), res) @@ -4509,7 +4509,7 @@ def test_split_axes(): assert_array_equal(res.transpose('a', 'b', 'c', 'd'), arr) # regex - res = combined.split_axes('b_d', names=['b', 'd'], regex='(\w+)_(\w+)') + res = combined.split_axes('b_d', names=['b', 'd'], regex=r'(\w+)_(\w+)') assert res.axes.names == ['a', 'b', 'd', 'c'] assert res.shape == (2, 3, 5, 4) assert_array_equal(res.transpose('a', 'b', 'c', 'd'), arr) @@ -4563,7 +4563,7 @@ def test_split_axes(): # using regex arr = ndtest('ab=a0b0..a1b2; c=c0..c3; d=d0..d3; ef=e0f0..e2f1') - res = arr.split_axes({'ab': ('a', 'b'), 'ef': ('e', 'f')}, regex='(\w{2})(\w{2})') + res = arr.split_axes({'ab': ('a', 'b'), 'ef': ('e', 'f')}, regex=r'(\w{2})(\w{2})') assert res.axes.names == ['a', 'b', 'c', 'd', 'e', 'f'] assert res.size == arr.size assert res.shape == (2, 3, 4, 4, 3, 2) From 0922d64565cb230f6cd42d6d477612a8e93fc5f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 21 Feb 2019 15:43:58 +0100 Subject: [PATCH 06/44] WIP: fixed creating an LSet from an IGroup with a scalar key (needs test & changelog) --- larray/core/group.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/larray/core/group.py b/larray/core/group.py index 2e6d703e1..15ce0aa65 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -668,6 +668,7 @@ def 
_translate_group_key_hdf(key): return key +# TODO: kill this function def union(*args): # TODO: add support for LGroup and lists """ @@ -1613,7 +1614,7 @@ class LSet(LGroup): def __init__(self, key, name=None, axis=None): key = _to_key(key) - if isinstance(key, LGroup): + if isinstance(key, Group): if name is None: name = key.name if axis is None: From 32505d263e220cac6dcf099e74872c27600743e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 4 Oct 2018 14:15:37 +0200 Subject: [PATCH 07/44] avoid a few DeprecationWarnings in tests by using Axis&Group.matching(regex=pattern) instead of matching(pattern) --- larray/core/axis.py | 2 +- larray/tests/test_axis.py | 2 +- larray/tests/test_group.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 7b636a00c..46ab1d79d 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -801,7 +801,7 @@ def index(self, key, bool_passthrough=True): >>> people = Axis(['John Doe', 'Bruce Wayne', 'Bruce Willis', 'Waldo', 'Arthur Dent', 'Harvey Dent'], 'people') >>> people.index('Waldo') 3 - >>> people.index(people.matching('Bruce')) + >>> people.index(people.containing('Bruce')) array([1, 2]) """ mapping = self._mapping diff --git a/larray/tests/test_axis.py b/larray/tests/test_axis.py index 53fcb854c..95af5351d 100644 --- a/larray/tests/test_axis.py +++ b/larray/tests/test_axis.py @@ -291,7 +291,7 @@ def test_init_from_group(): def test_matching(): sutcode = Axis(['A23', 'A2301', 'A25', 'A2501'], 'sutcode') - assert sutcode.matching('^...$') == LGroup(['A23', 'A25']) + assert sutcode.matching(regex='^...$') == LGroup(['A23', 'A25']) assert sutcode.startingwith('A23') == LGroup(['A23', 'A2301']) assert sutcode.endingwith('01') == LGroup(['A2301', 'A2501']) diff --git a/larray/tests/test_group.py b/larray/tests/test_group.py index f24dbc4ce..6cb9ff6c5 100644 --- a/larray/tests/test_group.py +++ b/larray/tests/test_group.py @@ -86,7 +86,7 @@ 
def test_init_lgroup(lgroups): group = age['1:5'] >> group2 assert group.name == group2.name axis = Axis('axis=a,a0..a3,b,b0..b3,c,c0..c3') - for code in axis.matching('^.$'): + for code in axis.matching(regex='^.$'): group = axis.startingwith(code) >> code assert group.equals(axis.startingwith(code) >> str(code)) From 13790f1b240f41d477bf82a21ac89c5d2efa10b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 17:42:57 +0200 Subject: [PATCH 08/44] added TODO/XXX/FIXME/misc comment --- larray/core/abstractbases.py | 1 + larray/core/array.py | 10 ++++++++-- larray/core/axis.py | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/larray/core/abstractbases.py b/larray/core/abstractbases.py index 0546bb6a6..d5d766673 100644 --- a/larray/core/abstractbases.py +++ b/larray/core/abstractbases.py @@ -5,6 +5,7 @@ # define abstract base classes to enable isinstance type checking on our objects # idea taken from https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/generic.py +# FIXME: __metaclass__ is ignored in Python 3 class ABCAxis(object): __metaclass__ = ABCMeta diff --git a/larray/core/array.py b/larray/core/array.py index e260c6201..53e4d8195 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -17,7 +17,7 @@ # * Axis.sequence? geo.seq('A31', 'A38') (equivalent to geo['A31..A38']) -# * re-implement row_totals/col_totals? or what do we do with them? +# ? re-implement row_totals/col_totals? or what do we do with them? # * time specific API so that we know if we go for a subclass or not @@ -396,7 +396,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): # we still need to prepare the key instead of letting numpy handle everything so that - # existing (integer)LArray keys are handled correctly (broadcasted using axes names). + # existing (integer)LArray keys are broadcasted correctly (using axes names). 
self.array.__setitem__(self._prepare_key(key, wildcard=True), value, translate_key=False) @@ -1272,6 +1272,9 @@ def __bool__(self): # Python 2 __nonzero__ = __bool__ + # TODO: this should be a thin wrapper around a method in AxisCollection + # TODO: either support a list (of axes names) as first argument here (and set_labels) + # or don't support that in set_axes def rename(self, renames=None, to=None, inplace=False, **kwargs): """Renames axes of the array. @@ -8252,6 +8255,7 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): axis = Axis(axis) if elements is None: if not isinstance(axis, Axis) and sys.version_info[:2] < (3, 6): + # XXX: this should probably be a warning, not an error raise TypeError("axis argument should provide label order when using keyword arguments on Python < 3.6") elements = kwargs.items() elif kwargs: @@ -8267,6 +8271,8 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): axis = elements.axes[axis] values = [elements[k] for k in axis] elif isinstance(elements, dict): + # TODO: support having no Axis object for Python3.7 (without error or warning) + # XXX: we probably want to support this with a warning on Python < 3.7 assert isinstance(axis, Axis) values = [elements[v] for v in axis.labels] elif isinstance(elements, Iterable): diff --git a/larray/core/axis.py b/larray/core/axis.py index 46ab1d79d..a6c7b0058 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -104,6 +104,7 @@ def __init__(self, labels, name=None): self.__sorted_values = None self._length = None self._iswildcard = False + # set _labels, _length and _iswildcard via the property self.labels = labels @property @@ -2059,6 +2060,9 @@ def copy(self): """ return self[:] + # XXX: what's the point in supporting a list of Axis or AxisCollection in axes_to_replace? + # it is used in LArray.set_axes but if it is only there, shouldn't the support for that be + # moved there? 
def replace(self, axes_to_replace=None, new_axis=None, inplace=False, **kwargs): """Replace one, several or all axes of the collection. From 319a124ea68910e072fb00064c4ba9ad337f6939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 15:38:34 +0200 Subject: [PATCH 09/44] better docstrings in various functions & methods --- larray/core/array.py | 45 +++++++++++++++++++++++++++-------------- larray/core/metadata.py | 4 +--- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 53e4d8195..4ee884cf9 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -250,7 +250,7 @@ def concat(arrays, axis=0, dtype=None): arrays : tuple of LArray Arrays to concatenate. axis : axis reference (int, str or Axis), optional - Axis along which to concatenate. Defaults to the first axis. + Axis along which to concatenate. All arrays must have that axis. Defaults to the first axis. dtype : dtype, optional Result data type. Defaults to the "closest" type which can hold all arrays types without loss of information. @@ -383,7 +383,7 @@ def __setitem__(self, key, value): # TODO: rename to LArrayIndexPointsIndexer or something like that class LArrayPositionalPointsIndexer(object): """ - the closer to numpy indexing we get, but not 100% the same. + the closest to numpy indexing we get, but not 100% the same. """ def __init__(self, array): self.array = array @@ -5444,7 +5444,7 @@ def divnot0(self, other): # XXX: rename/change to "add_axes" ? # TODO: add a flag copy=True to force a new array. def expand(self, target_axes=None, out=None, readonly=False): - """Expands array to target_axes. + r"""Expands array to target_axes. Target axes will be added to array if not present. 
In most cases this function is not needed because LArray can do operations with arrays having different @@ -5452,7 +5452,7 @@ def expand(self, target_axes=None, out=None, readonly=False): Parameters ---------- - target_axes : list of Axis or AxisCollection, optional + target_axes : string, list of Axis or AxisCollection, optional Self can contain axes not present in `target_axes`. The result axes will be: [self.axes not in target_axes] + target_axes out : LArray, optional @@ -5471,22 +5471,37 @@ def expand(self, target_axes=None, out=None, readonly=False): >>> b = Axis('b=b1,b2') >>> arr = ndtest([a, b]) >>> arr - a\\b b1 b2 + a\b b1 b2 a1 0 1 a2 2 3 + + Adding one or several axes will append the new axes at the end + >>> c = Axis('c=c1,c2') + >>> arr.expand(c) + a b\c c1 c2 + a1 b1 0 0 + a1 b2 1 1 + a2 b1 2 2 + a2 b2 3 3 + + If you want the new axes to be inserted in a particular order, you have to give that order + >>> arr.expand([a, c, b]) - a c\\b b1 b2 + a c\b b1 b2 + a1 c1 0 1 + a1 c2 0 1 + a2 c1 2 3 + a2 c2 2 3 + + But it is enough to list only the added axes and the axes after them: + + >>> arr.expand([c, b]) + a c\b b1 b2 a1 c1 0 1 a1 c2 0 1 a2 c1 2 3 a2 c2 2 3 - >>> arr.expand([b, c]) - a b\\c c1 c2 - a1 b1 0 0 - a1 b2 1 1 - a2 b1 2 2 - a2 b2 3 3 """ if target_axes is None and out is None or target_axes is not None and out is not None: raise ValueError("either target_axes or out must be defined (not both)") @@ -6700,8 +6715,8 @@ def shift(self, axis, n=1): ---------- axis : int, str or Axis Axis for which we want to perform the shift. - n : int - Number of cells to shift. + n : int, optional + Number of cells to shift. Defaults to 1. Returns ------- @@ -8148,7 +8163,7 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): Returns ------- LArray - A single array combining arrays. + A single array combining arrays. The new (stacked) axes will be the last axes of the new array. 
Examples -------- diff --git a/larray/core/metadata.py b/larray/core/metadata.py index 1a7387f53..d208c0f31 100644 --- a/larray/core/metadata.py +++ b/larray/core/metadata.py @@ -89,7 +89,6 @@ def __repr__(self): else: class AttributeDict(OrderedDict): - def __getattr__(self, key): try: return self[key] @@ -124,7 +123,7 @@ class Metadata(AttributeDict): >>> # Python 2 or <= 3.5 >>> arr = ndtest((3, 3), meta=[('title', 'the title'), ('author', 'John Smith')]) >>> # Python 3.6+ - >>> arr = ndtest((3, 3), meta=Metadata(title = 'the title', author = 'John Smith')) # doctest: +SKIP + >>> arr = ndtest((3, 3), meta=Metadata(title='the title', author='John Smith')) # doctest: +SKIP Add metadata after array initialization @@ -143,7 +142,6 @@ class Metadata(AttributeDict): >>> del arr.meta.creation_date """ - # TODO: use LArray.from_dict once ready (issue 581) def __larray__(self): from larray.core.array import LArray From 018867611fc4156aa53f0768f23eaa8781889317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 17 Oct 2018 15:58:46 +0200 Subject: [PATCH 10/44] document LArray.as_table(light=True) --- larray/core/array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/larray/core/array.py b/larray/core/array.py index 4ee884cf9..763a60642 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -2303,6 +2303,9 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_nam only the first and last `edgeitems` lines are displayed. Only active if `maxlines` is not None. Equals to 5 by default. + light : bool, optional + Whether or not to hide repeated labels. In other words, only show a label if it is different from the + previous one. Defaults to False. wide : boolean, optional Whether or not to write arrays in "wide" format. If True, arrays are exported with the last axis represented horizontally. 
If False, arrays are exported in "narrow" format: one column per axis plus one From fc7f32da7b5d66fb110529a86a708fe6f9b82c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 17:43:41 +0200 Subject: [PATCH 11/44] updated design notes --- design.txt | 383 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 230 insertions(+), 153 deletions(-) diff --git a/design.txt b/design.txt index 3bd221c9d..07d407f99 100644 --- a/design.txt +++ b/design.txt @@ -1,6 +1,4 @@ - -a(sex, age) -age_limit(sex) +assuming the following arrays: a(sex, age) and age_limit(sex) step 1: @@ -10,10 +8,10 @@ b = a * (age > age_limit) step 2: -a[x.age > age_limit] -# this is also possible ("x.age > age_limit" return an Expr, expr is evaluated +a[X.age > age_limit] +# this is also possible ("X.age > age_limit" returns an Expr, expr is evaluated # during the binop (axes ref replace by real axe) -b = a * (x.age > age_limit) +b = a * (X.age > age_limit) ============== in general: @@ -55,31 +53,28 @@ in general: # API for ND groups (my example is mixing label with positional): -# union (bands): x.axis1[5:10] | x.axis2.i[3:4] -# intersection/cross/default: x.axis1[5:10] & x.axis2.i[3:4] -# points: x.axis1[5:10] ^ x.axis2.i[1:6] -# ----> this prevents symetric difference. this is little used but... -# ----> Points(x.axis[5:10], x.axis2.i[1:6]) +# union (bands): X.axis1[5:10] | X.axis2.i[3:4] +# intersection/cross/default: X.axis1[5:10] & X.axis2.i[3:4] +# points: +# * X.axis1[5:10] ^ X.axis2.i[1:6] --> this prevents symmetric difference. this is little used but... 
+# * Points(X.axis[5:10], X.axis2.i[1:6]) +# * X.axis[5:10].combine(X.axis2.i[1:6]) # this is very nice and would have orderedset-like semantics # it does not seem to conflict with the axis methods (even though that might be # confusing): -# x.axis1 | x.axis2 would have a very different meaning than -# x.axis1[:] | x.axis2[:] +# X.axis1 | X.axis2 would have a very different meaning than +# X.axis1[:] | X.axis2[:] # Note that cross sections is the default and it is useless to introduce # another API **except to give a name**, so the & syntax is useless unless # we allow naming groups after the fact -# => NDGroup((x.axis1[5:10], x.axis2.i[2.5]), 'exports') -# => Group((x.axis1[5:10], x.axis2.i[2.5]), 'exports') -# => (x.axis1[5:10] & x.axis2.i[2.5]).named('exports') - -# generalizing "named" and suppressing .group seems like a good idea! -# => x.axis1.group([5, 7, 10], name='brussels') -# => x.axis1[5, 7, 10].named('brussels') +# => NDGroup((X.axis1[5:10], X.axis2.i[2.5]), 'exports') +# => Group((X.axis1[5:10], X.axis2.i[2.5]), 'exports') +# => (X.axis1[5:10] & X.axis2.i[2.5]).named('exports') # http://xarray.pydata.org/en/stable/indexing.html#pointwise-indexing # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.lookup.html#pandas.DataFrame.lookup @@ -92,14 +87,14 @@ in general: # I wonder if, for axes subscripting, I could not allow tuples as sequences, # which would make it a bit nicer: -# x.axis1[5, 7, 10].named('brussels') +# X.axis1[5, 7, 10].named('brussels') # instead of -# x.axis1[[5, 7, 10]].named('brussels') +# X.axis1[[5, 7, 10]].named('brussels') # since axes are always 1D, this is not a direct problem. 
However the # question is whether this would lead to an inconsistent API/confuse users # because they would still have to write the brackets when no axis is present # a[[5, 7, 9]] -# a[x.axis1[5, 7, 9]] +# a[X.axis1[5, 7, 9]] # in practice, this syntax is little used anyway # options @@ -210,7 +205,7 @@ ou alors on utilise une méthode spécifique pour split (split ou groups ou multi): G.split[2, 5] == G[2], G[5] -G.clength.split[2, 5:10, 20] == G.clength[2], G.clength[5:10], G.clength[5] +G.clength.split[2, 5:10, 20] == G.clength[2], G.clength[5:10], G.clength[20] G.clength.split[2, 5] == G.clength[2], G.clength[5] @@ -347,14 +342,14 @@ G.clength.split[2, 5] == G.clength[2], G.clength[5] # m = {G[2:7, 'M']: 1, G[2:7, 'F']: 2, G[5:10, 'M']: 3, G[5:10, 'F']: 4} # breaks if combination of axes -# a.set(x.age[m]) +# a.set(X.age[m]) 2) multiple range in same [] means "and" ========================================= - => set op if same axis, ND group otherwise + => and set op if same axis, ND group otherwise - G.age[5, 7, 9] == G.age[5] & G.age[7] & G.age[9] => BREAKS ! - => must use G.age[[5, 7, 9]] + G.age[5, 7, 9] == G.age[5] & G.age[7] & G.age[9] => EMPTY group ! 
+ => MUST use double brackets: G.age[[5, 7, 9]] G.age[:20, 10:30] == G.age[:20] & G.age[10:30] == G.age[20:30] G[2:7, 'M', ['P01', 'P05']] == G[2:7] & G['M'] & G['P01', 'PO5'] @@ -394,16 +389,50 @@ G.clength.split[2, 5] == G.clength[2], G.clength[5] or whether slice or scalar =========================================================================== +7) multiple range in same [] are only allowed for same axis (and means "or") +============================================================================ + => set op if same axis, different axis not allowed + => the definition of a Group is: a list of labels of one axis + implies more or less that we must have a different object for ND Groups + => implies more or less that we will not support + array['5, 7, 11, P01,P05, M'] + array[5, 7, 11, 'P01, P05, M'] + this is fine though: + array['5, 7, 11; P01, P05; M'] + array[[5, 7, 11], ['P01', 'P05'], 'M'] + array[[5, 7, 11], 'P01, P05', 'M'] + and maybe this too: + array[[5, 7, 11], 'P01, P05; M'] + + G.age[5, 7, 11] == G.age[[5, 7, 11]] == G.age[5] | G.age[7] | G.age[11] + G.age[5, 7:9, 11] == G.age[5, 7, 8, 9, 11] + G.age[:20, 10:30] == G.age[:20] | G.age[10:30] == G.age[:30] + G[5, 7:9, 'M', ['P01', 'P05']] --> fails because it tries to find a single axis containing all of those + G[5, 7:9] & G['M'] & G['P01', 'P05'] --> works (returns NDGroup) + G['5, 7:9; P01,P05; M'] --> returns NDGroup (same as above) + G[[5, 7:9], ['P01', 'P05'], 'M'] --> fails 7:9 is sadly an invalid syntax + G[[5, 7, 8, 9], 'P01, P05', 'M'] --> works too + == age[5, 7, 8, 9] & sex['M'] & lipro['P01', 'P05'] + == NDGroup([[5, 7, 8, 9], 'M', ['P01', 'P05']], axes=['age', 'sex', 'lipro']) + OR + == NDGroup({'age': [5, 7, 8, 9], + 'sex': 'M', + 'lipro': ['P01', 'P05']}) + + '5,7:9; M; P01,P05' + 'age[5,7:9]; sex[M]; lipro[P01,P05]' + # use cases # 1) simple get/set +a['2:7; M; P01,P02'] a[2:7, 'M', ['P01', 'P02']] # 2) boolean selection -a[(x.age < 10) | (x.clength > 5)] +a[(X.age < 10) | (X.clength > 
5)] # 3) simple with ambiguous values @@ -413,16 +442,19 @@ a[G.age[2:7], G.clength[5, 7, 9], 'M', ['P01', 'P02']] a[G.age[2:4] ^ G.clength[5, 7, 9], 'M', ['P01', 'P02']] a[G[2, 9, 3] ^ G['M', 'F', 'M'], ['P01', 'P02']] +# set "diagonal" to 0 +countries = ... +use[src[countries] ^ dst[countries]] = 0 + # 4b) lookup (this is a form of point-selection), wh potentially repeated values person_age = [5, 1, 2, 1, 6, 5] person_gender = ['M', 'F', 'F', 'M', 'M', 'F'] person_workstate = [1, 3, 1, 2, 1, 3] -income = mean_income[person_age, person_gender] # <-- no ! does cross product +income = mean_income[person_age, person_gender] # <-- FAILS ! (it does a cross product) income = mean_income[G[person_age] ^ G[person_gender]] -income = mean_income[G.points[person_age, person_gender]] # <-- disallow having - # an axis named - # "points" +income = mean_income[G[person_age].combine(G[person_gender])] +income = mean_income[G.points[person_age, person_gender]] # <-- disallow having an axis named "points" income = mean_income.points[person_age, person_gender] # if ambiguous income = mean_income.points[G.age[person_age], person_gender] @@ -453,7 +485,7 @@ income = extra_income[LK[person_gender] & (LK[workstate] == 1)] # .points by default. # A: yes, that's an option but would not solve the "set" problem. -# => I NEED a way to set the axis on an LKey. maybe x.abc[LK] should not +# => I NEED a way to set the axis on an LKey. maybe X.abc[LK] should not # return an LSet? but an LKey with an axis. 
@@ -520,8 +552,8 @@ act3 = table(['sub', '40+', '-39', '40+'], # 6) multi slices, aggregate (one group per slice) -# groups = (x.clength[1:15], x.clength[16:25], x.clength[26:30], -# x.clength[31:35], x.clength[36:40], x.clength[41:50]) +# groups = (X.clength[1:15], X.clength[16:25], X.clength[26:30], +# X.clength[31:35], X.clength[36:40], X.clength[41:50]) # agg = arr.sum(groups) # groups = G.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50] @@ -532,46 +564,46 @@ act3 = table(['sub', '40+', '-39', '40+'], # 7) multi slices, assign one value per slice # multip_mat_min = zeros([clength, year]) -# multip_mat_min[x.clength[1:15], x.year[first_year_p:2024]] = 7 / 7 -# multip_mat_min[x.clength[16:25], x.year[first_year_p:2024]] = 20 / 20 -# multip_mat_min[x.clength[26:30], x.year[first_year_p:2024]] = 27 / 27 -# multip_mat_min[x.clength[31:35], x.year[first_year_p:2024]] = 32 / 32 -# multip_mat_min[x.clength[36:40], x.year[first_year_p:2024]] = 37 / 37 -# multip_mat_min[x.clength[41:50], x.year[first_year_p:2024]] = 42 / 42 -# multip_mat_min[x.clength[1:15], x.year[2025:2029]] = 8 / 7 -# multip_mat_min[x.clength[16:25], x.year[2025:2029]] = 21 / 20 -# multip_mat_min[x.clength[26:30], x.year[2025:2029]] = 28 / 27 -# multip_mat_min[x.clength[31:35], x.year[2025:2029]] = 33 / 32 -# multip_mat_min[x.clength[36:40], x.year[2025:2029]] = 38 / 37 -# multip_mat_min[x.clength[41:50], x.year[2025:2029]] = 43 / 42 -# multip_mat_min[x.clength[1:15], x.year[2030:]] = 9 / 7 -# multip_mat_min[x.clength[16:25], x.year[2030:]] = 22 / 20 -# multip_mat_min[x.clength[26:30], x.year[2030:]] = 29 / 27 -# multip_mat_min[x.clength[31:35], x.year[2030:]] = 34 / 32 -# multip_mat_min[x.clength[36:40], x.year[2030:]] = 39 / 37 -# multip_mat_min[x.clength[41:50], x.year[2030:]] = 44 / 42 +# multip_mat_min[X.clength[1:15], X.year[first_year_p:2024]] = 7 / 7 +# multip_mat_min[X.clength[16:25], X.year[first_year_p:2024]] = 20 / 20 +# multip_mat_min[X.clength[26:30], X.year[first_year_p:2024]] = 27 / 27 
+# multip_mat_min[X.clength[31:35], X.year[first_year_p:2024]] = 32 / 32 +# multip_mat_min[X.clength[36:40], X.year[first_year_p:2024]] = 37 / 37 +# multip_mat_min[X.clength[41:50], X.year[first_year_p:2024]] = 42 / 42 +# multip_mat_min[X.clength[1:15], X.year[2025:2029]] = 8 / 7 +# multip_mat_min[X.clength[16:25], X.year[2025:2029]] = 21 / 20 +# multip_mat_min[X.clength[26:30], X.year[2025:2029]] = 28 / 27 +# multip_mat_min[X.clength[31:35], X.year[2025:2029]] = 33 / 32 +# multip_mat_min[X.clength[36:40], X.year[2025:2029]] = 38 / 37 +# multip_mat_min[X.clength[41:50], X.year[2025:2029]] = 43 / 42 +# multip_mat_min[X.clength[1:15], X.year[2030:]] = 9 / 7 +# multip_mat_min[X.clength[16:25], X.year[2030:]] = 22 / 20 +# multip_mat_min[X.clength[26:30], X.year[2030:]] = 29 / 27 +# multip_mat_min[X.clength[31:35], X.year[2030:]] = 34 / 32 +# multip_mat_min[X.clength[36:40], X.year[2030:]] = 39 / 37 +# multip_mat_min[X.clength[41:50], X.year[2030:]] = 44 / 42 # # # already possible # m = zeros(clength) -# m[x.clength[1:15]] = 7 -# m[x.clength[16:25]] = 20 -# m[x.clength[26:30]] = 27 -# m[x.clength[31:35]] = 32 -# m[x.clength[36:40]] = 37 -# m[x.clength[41:50]] = 42 +# m[X.clength[1:15]] = 7 +# m[X.clength[16:25]] = 20 +# m[X.clength[26:30]] = 27 +# m[X.clength[31:35]] = 32 +# m[X.clength[36:40]] = 37 +# m[X.clength[41:50]] = 42 # multip_mat_min = zeros([clength, year]) -# multip_mat_min[x.year[:2024]] = m / m -# multip_mat_min[x.year[2025:2029]] = (m + 1) / m -# multip_mat_min[x.year[2030:]] = (m + 2) / m +# multip_mat_min[X.year[:2024]] = m / m +# multip_mat_min[X.year[2025:2029]] = (m + 1) / m +# multip_mat_min[X.year[2030:]] = (m + 2) / m # >>> very nice for this case but it does not scale very well with number of # values to set. 
On the other hand, splitting it in case it does not fit # on a line is not TOO horrible (just a bit horrible ;-)) # m = zeros(clength) -# m[x.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] = \ +# m[X.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] = \ # [ 7, 20, 27, 32, 37, 42] # multip_mat_min = zeros([clength, year]) -# multip_mat_min[x.year[:2024, 2025:2029, 2030:]] = \ +# multip_mat_min[X.year[:2024, 2025:2029, 2030:]] = \ # [m / m, (m + 1) / m, (m + 2) / m] # m = zeros(clength) @@ -613,12 +645,12 @@ act3 = table(['sub', '40+', '-39', '40+'], # a[G.age[5, 7, 9]] # a[G.geo[5, 7, 9].named('brussels')] -# a[x.age[G[5, 7, 9]]] -# a[x.age[G[5, 7, 9].named('brussels')]] +# a[X.age[G[5, 7, 9]]] +# a[X.age[G[5, 7, 9].named('brussels')]] # a[G.get('strange axis')[5, 7, 9].named('Brussels')] -# a[x.age[5, 7, 9]] +# a[X.age[5, 7, 9]] # positional groups *without axis* (G.i, P[], or I[]) does not make much sense, # because it will matches all axes, but might be useful as an intermediate @@ -673,29 +705,29 @@ act3 = table(['sub', '40+', '-39', '40+'], # we also need the best possible syntax to handle, "arbitrary" resampling # pure_min_w1_comp_agg = zeros(result_axes) -# pure_min_w1_comp_agg[x.LBMosesXLS[1]] = pure_min_w1_comp.sum(x.clength[1:15]) -# pure_min_w1_comp_agg[x.LBMosesXLS[2]] = pure_min_w1_comp.sum(x.clength[16:25]) -# pure_min_w1_comp_agg[x.LBMosesXLS[3]] = pure_min_w1_comp.sum(x.clength[26:30]) -# pure_min_w1_comp_agg[x.LBMosesXLS[4]] = pure_min_w1_comp.sum(x.clength[31:35]) -# pure_min_w1_comp_agg[x.LBMosesXLS[5]] = pure_min_w1_comp.sum(x.clength[36:40]) -# pure_min_w1_comp_agg[x.LBMosesXLS[6]] = pure_min_w1_comp.sum(x.clength[41:50]) +# pure_min_w1_comp_agg[X.LBMosesXLS[1]] = pure_min_w1_comp.sum(X.clength[1:15]) +# pure_min_w1_comp_agg[X.LBMosesXLS[2]] = pure_min_w1_comp.sum(X.clength[16:25]) +# pure_min_w1_comp_agg[X.LBMosesXLS[3]] = pure_min_w1_comp.sum(X.clength[26:30]) +# pure_min_w1_comp_agg[X.LBMosesXLS[4]] = pure_min_w1_comp.sum(X.clength[31:35]) 
+# pure_min_w1_comp_agg[X.LBMosesXLS[5]] = pure_min_w1_comp.sum(X.clength[36:40]) +# pure_min_w1_comp_agg[X.LBMosesXLS[6]] = pure_min_w1_comp.sum(X.clength[41:50]) # -# clength_groups = (x.clength[1:15], x.clength[16:25], x.clength[26:30], -# x.clength[31:35], x.clength[36:40], x.clength[41:50]) +# clength_groups = (X.clength[1:15], X.clength[16:25], X.clength[26:30], +# X.clength[31:35], X.clength[36:40], X.clength[41:50]) # pure_min_w1_comp_agg2 = pure_min_w1_comp.sum(clength_groups).rename( -# x.clength, x.LBMosesXLS) +# X.clength, X.LBMosesXLS) # clength_groups = (L[1:15], L[16:25], L[26:30], # L[31:35], L[36:40], L[41:50]) # pure_min_w1_comp_agg2 = pure_min_w1_comp.sum(clength_groups).rename( -# x.clength, x.LBMosesXLS) +# X.clength, X.LBMosesXLS) # -# clength_groups = x.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50] +# clength_groups = X.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50] # pure_min_w1_comp_agg2 = pure_min_w1_comp.sum(clength_groups) # # clength_groups = G[1:15, 16:25, 26:30, 31:35, 36:40, 41:50] # pure_min_w1_comp_agg2 = pure_min_w1_comp.sum(clength_groups) \ -# .replace(x.clength, LBMosesXLS) +# .replace(X.clength, LBMosesXLS) # XXX: what if I want to sum on all the slices (as if it was a single slice) # clength_groups = G[1:15] | G[16:25] | G[26:30] | G[31:35] | G[36:40] | G[41:50] @@ -715,40 +747,40 @@ act3 = table(['sub', '40+', '-39', '40+'], # ]) # # multip_mat_min = zeros([clength, year]) -# multip_mat_min[x.clength[1:15], x.year[first_year_p:2024]] = 7 / 7 -# multip_mat_min[x.clength[16:25], x.year[first_year_p:2024]] = 20 / 20 -# multip_mat_min[x.clength[26:30], x.year[first_year_p:2024]] = 27 / 27 -# multip_mat_min[x.clength[31:35], x.year[first_year_p:2024]] = 32 / 32 -# multip_mat_min[x.clength[36:40], x.year[first_year_p:2024]] = 37 / 37 -# multip_mat_min[x.clength[41:50], x.year[first_year_p:2024]] = 42 / 42 -# multip_mat_min[x.clength[1:15], x.year[2025:2029]] = 8 / 7 -# multip_mat_min[x.clength[16:25], x.year[2025:2029]] = 21 
/ 20 -# multip_mat_min[x.clength[26:30], x.year[2025:2029]] = 28 / 27 -# multip_mat_min[x.clength[31:35], x.year[2025:2029]] = 33 / 32 -# multip_mat_min[x.clength[36:40], x.year[2025:2029]] = 38 / 37 -# multip_mat_min[x.clength[41:50], x.year[2025:2029]] = 43 / 42 -# multip_mat_min[x.clength[1:15], x.year[2030:]] = 9 / 7 -# multip_mat_min[x.clength[16:25], x.year[2030:]] = 22 / 20 -# multip_mat_min[x.clength[26:30], x.year[2030:]] = 29 / 27 -# multip_mat_min[x.clength[31:35], x.year[2030:]] = 34 / 32 -# multip_mat_min[x.clength[36:40], x.year[2030:]] = 39 / 37 -# multip_mat_min[x.clength[41:50], x.year[2030:]] = 44 / 42 +# multip_mat_min[X.clength[1:15], X.year[first_year_p:2024]] = 7 / 7 +# multip_mat_min[X.clength[16:25], X.year[first_year_p:2024]] = 20 / 20 +# multip_mat_min[X.clength[26:30], X.year[first_year_p:2024]] = 27 / 27 +# multip_mat_min[X.clength[31:35], X.year[first_year_p:2024]] = 32 / 32 +# multip_mat_min[X.clength[36:40], X.year[first_year_p:2024]] = 37 / 37 +# multip_mat_min[X.clength[41:50], X.year[first_year_p:2024]] = 42 / 42 +# multip_mat_min[X.clength[1:15], X.year[2025:2029]] = 8 / 7 +# multip_mat_min[X.clength[16:25], X.year[2025:2029]] = 21 / 20 +# multip_mat_min[X.clength[26:30], X.year[2025:2029]] = 28 / 27 +# multip_mat_min[X.clength[31:35], X.year[2025:2029]] = 33 / 32 +# multip_mat_min[X.clength[36:40], X.year[2025:2029]] = 38 / 37 +# multip_mat_min[X.clength[41:50], X.year[2025:2029]] = 43 / 42 +# multip_mat_min[X.clength[1:15], X.year[2030:]] = 9 / 7 +# multip_mat_min[X.clength[16:25], X.year[2030:]] = 22 / 20 +# multip_mat_min[X.clength[26:30], X.year[2030:]] = 29 / 27 +# multip_mat_min[X.clength[31:35], X.year[2030:]] = 34 / 32 +# multip_mat_min[X.clength[36:40], X.year[2030:]] = 39 / 37 +# multip_mat_min[X.clength[41:50], X.year[2030:]] = 44 / 42 # # # already possible # m = zeros(clength) -# m[x.clength[1:15]] = 7 -# m[x.clength[16:25]] = 20 -# m[x.clength[26:30]] = 27 -# m[x.clength[31:35]] = 32 -# m[x.clength[36:40]] = 37 -# 
m[x.clength[41:50]] = 42 +# m[X.clength[1:15]] = 7 +# m[X.clength[16:25]] = 20 +# m[X.clength[26:30]] = 27 +# m[X.clength[31:35]] = 32 +# m[X.clength[36:40]] = 37 +# m[X.clength[41:50]] = 42 # multip_mat_min = zeros([clength, year]) -# multip_mat_min[x.year[:2024]] = m / m -# multip_mat_min[x.year[2025:2029]] = (m + 1) / m -# multip_mat_min[x.year[2030:]] = (m + 2) / m +# multip_mat_min[X.year[:2024]] = m / m +# multip_mat_min[X.year[2025:2029]] = (m + 1) / m +# multip_mat_min[X.year[2030:]] = (m + 2) / m # TODO: it would be nice to be able to say: -# m[x.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] = [7, 20, 27, 32, 37, 42] +# m[X.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] = [7, 20, 27, 32, 37, 42] # but I am unsure it is possible/unambiguous # this kind of pattern is not supported by numpy @@ -773,13 +805,12 @@ act3 = table(['sub', '40+', '-39', '40+'], # (and if not, whether or not we can come up with a syntax that is both nice # and not ambiguous) -# m[x.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] = \ -# [7, 20, 27, 32, 37, 42] -# multip_mat_min[x.year[:2024, 2025:2029, 2030:]] = [m / m, (m + 1) / m, +# m[X.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] = [7, 20, 27, 32, 37, 42] +# multip_mat_min[X.year[:2024, 2025:2029, 2030:]] = [m / m, (m + 1) / m, # (m + 2) / m] # for the multi-value case to work I would probably have to make -# m[x.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] +# m[X.clength[1:15, 16:25, 26:30, 31:35, 36:40, 41:50]] # return multiple arrays (as a tuple of arrays or an array of arrays) # with pandas/MI support, we could just return an array with # a (second) clength axis @@ -798,18 +829,19 @@ act3 = table(['sub', '40+', '-39', '40+'], # aggregate) # ideally s.sum() would first sum each array then sum those sums -# and s.sum(x.age) would sum each array along age -# and s.sum(x.arrays) would try to add arrays together (and fail in +# and s.sum(X.age) would sum each array along age +# and s.sum(X.arrays) would try to 
add arrays together (and fail in # some/most cases) # the problem is that one important use case is not covered: -# aggregating along all dimensions of the arrays but NOT on x.arrays +# aggregating along all dimensions of the arrays but NOT on X.arrays +# but see below for solutions # Q: s.elements.sum() (or s.arrays.sum()) vs s.sum() solve this? # A1: s.arrays.sum() would dispatch to each array and return a new Session # s.sum() would try to do s.arrays.sum().sum() # seems doable... -# A2: s.sum_by(x.arrays) (like Pandas default aggregate) would solve +# A2: s.sum_by(X.arrays) (like Pandas default aggregate) would solve # the issue even more nicely, but this is a bit more work (is it?) and # can be safely added later. @@ -821,7 +853,7 @@ act3 = table(['sub', '40+', '-39', '40+'], # A: # Q: what happens when you do s1 + s2 ? -# A: same than s1.arrays + s2.arrays +# A: same than [a1 + a2 for a1, a2 in zip(s1, s2)] # if we view s1 as a big array with an extra dimension, it would give # that result (modulo union of names until we are Pandas based) @@ -835,7 +867,7 @@ act3 = table(['sub', '40+', '-39', '40+'], # (s1 == s2).all() # Q: what if I want to know which arrays are equal and which are not? -# A: (s1 == s2).all_by(x.arrays) +# A: (s1 == s2).all_by(X.arrays) # boolean ops # =========== @@ -908,20 +940,20 @@ act3 = table(['sub', '40+', '-39', '40+'], >>> a.sum('10:19 > 10_19 ; 20:29 > 20_29 ; year=#-1') >>> a.sum('(10:19 > 10_19 ; 20:29 > 20_29) & year=#-1') >>> teens = G['age=10:19 >> teens'] - >>> teens = x.age[10:19].named('teens') + >>> teens = X.age[10:19].named('teens') >>> twenties = G['age=20:29'] >>> a.sum('({teens}, {twenties})') >>> a.sum((teens, twenties)) # will we ever want to support this? 
>>> a.sum('age > clength') >>> a.sum('age > {ext}') - >>> a.sum(x.age > ext) + >>> a.sum(X.age > ext) >>> a.sum('age > 10') LGroup(['a', 'b', 'c'], name='abc') -expend_flow[x.cat_from['married_women'], x.cat_to['retirement_survival_women'], y] = \ - flow[x.cat_from['married_women'], x.cat_to['retirement_survival_women'], y] * \ +expend_flow[X.cat_from['married_women'], X.cat_to['retirement_survival_women'], y] = \ + flow[X.cat_from['married_women'], X.cat_to['retirement_survival_women'], y] * \ pension_age_diff_lag['married_men', y] * 1.1 * (45 / average_clength_survival['married_men', y]) expend_flow['cat_from[married_women], cat_to[retirement_survival_women]', y] = \ @@ -957,45 +989,45 @@ expend_flow['cat_from[married_women], cat_to[retirement_survival_women]', y] = \ # ================ set operation on groups =============== # ======================================================== -we want + and - ops on groups to be both set operation -or arithmetic operation depending on the case. +PROBLEM: we want __sub__ op on groups to be both a set operation or arithmetic operation depending on the case. 
for y in time[start_year + 1:]: res = a[y + 1] -for c in sutcode.matches('^...$') + sutcode.matches('^..$') - 'ND': - g = sutcode.startswith(c) - c +for c in sutcode.matching('^...$') + sutcode.matching('^..$') - 'ND': + g = sutcode.startingwith(c) - c -# option 1 -# ======== -op on evaluated key by default (whatever it is -- scalar or ndarray) +# option 1 (current) +# ================== + +execute __op__ on key.eval() by default (whatever it is -- scalar or ndarray) set ops must use specific methods for y in time[start_year + 1:]: res = a[y + 1] -for c in sutcode.matches('^...$').union(sutcode.matches('^..$')).setdiff('ND'): - g = sutcode.startswith(c).setdiff(c) -for c in sutcode.matches('^...$').union(sutcode.matches('^..$')).difference('ND'): - g = sutcode.startswith(c).difference(c) +for c in sutcode.matching('^...$').union(sutcode.matching('^..$')).difference('ND'): + g = sutcode.startingwith(c).difference(c) -# option 2 -# ======== -op on evaluated key by default (whatever it is -- scalar or ndarray) -convert LGroup to LSet using method a specific method +# option 2 (current too) +# ====================== + +execute __op__ on key.eval() by default (whatever it is -- scalar or ndarray) +convert LGroup to LSet using a specific method for y in time[start_year + 1:]: res = a[y + 1] # the second .set() is optional -for c in sutcode.matches('^...$').set() | sutcode.matches('^..$').set() - 'ND': - g = sutcode.startswith(c).set() - c +for c in sutcode.matching('^...$').set() | sutcode.matching('^..$').set() - 'ND': + g = sutcode.startingwith(c).set() - c -# option 3 (current) -# ================== + +# option 3 (before) +# ================= set op on evaluated key by default need to use .labels on the axis or .eval() on the group to do arithmetic ops @@ -1005,31 +1037,78 @@ for y in time.labels: for y in time[start_year + 1:].eval(): res = a[y + 1] -for c in sutcode.matches('^...$') | sutcode.matches('^..$') - 'ND': - g = sutcode.startswith(c) - c +for c in 
sutcode.matching('^...$') | sutcode.matching('^..$') - 'ND': + g = sutcode.startingwith(c) - c + # option 4 # ======== -set op if sequence, arithmetic if scalar. This looks good in our example and is usually what people want -but this is not the path of least surprise ! +set op if "current" (left object) is a sequence, arithmetic if scalar. +This looks good in our example and is usually what people want but this is not the path of least surprise ! + +for y in time[start_year + 1:]: + # expected result (arithmetic) + res = a[y + 1] + +# expected result (set op) +for c in sutcode.matching('^...$') | sutcode.matching('^..$') - 'ND': + # expected result (set op) + g = sutcode.startingwith(c) - c + +# UNEXPECTED result (set op) +for age in age[1:] + 1: + + +# option 5 +# ======== + +set op if string type (scalar or sequence), arithmetic if *numeric* (scalar or sequence). +This also looks good in our example and is usually what people want but this can lead to surprises too ! for y in time[start_year + 1:]: + # expected result (arithmetic) res = a[y + 1] -for c in sutcode.matches('^...$') | sutcode.matches('^..$') - 'ND': - g = sutcode.startswith(c) - c +# expected result (set op) +for c in sutcode.matching('^...$') | sutcode.matching('^..$') - 'ND': + # expected result (set op) + g = sutcode.startingwith(c) - c -# an example of unexpected result would be: -age[1:] + 1 +# expected result (arith op) +for age in age[1:] + 1: + ... +# unexpected result (arith op) +codes.in_([1, 2, 5]) - bad_code + + +# option 6 (variant of option 4) +# ============================== + +iter(Axis) and iter(Group) return a Label, not a Group. +now we could make both axis[a_single_label] returns a Label or a Group, but it would probably be cleaner to kill +"scalar" groups altogether, so we would have: +* axis[a_single_label] returns a Label +* axis[[a_single_label]] return a Group with a single element. This is not the same (never was) than a scalar Group. 
+In either case, on Group: "setish" ops (ie allow duplicates on the LHS) +on Label: op on .eval() for y in time[start_year + 1:]: + # expected result (arithmetic via Label) res = a[y + 1] -for c in sutcode.matches('^...$') | sutcode.matches('^..$') - 'ND': - g = sutcode.startswith(c) - c +# expected result (set op) +for c in sutcode.matching('^...$') | sutcode.matching('^..$') - 'ND': + # expected result (set op) + g = sutcode.startingwith(c) - c + +# somewhat UNEXPECTED result (fails: no + defined on Group) +for age in age[1:] + 1: +# somewhat UNEXPECTED result (set op) +for age in age[1:] - 1: + ========================================== ========================================== @@ -1054,5 +1133,3 @@ subset = pop.q('M, sum(10:20 >> yada1, 20:30 >> yada2')) # if using a function (like .q) we could also "rename" axes on the fly. the above would create an aggregated axis # named "age" but the code below would create "toto" instead subset = pop.q('M', toto=age.sum[10:20, 20:30]) - - From ae72b451bad14983146b4861df8a3c239d8d367c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 27 Sep 2018 11:31:24 +0200 Subject: [PATCH 12/44] moved ratio and rationot0 to the Aggregation function section of the API --- doc/source/api.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 9a42ac373..8cc8a3c72 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -357,6 +357,8 @@ Aggregation Functions LArray.ptp LArray.with_total LArray.percent + LArray.ratio + LArray.rationot0 LArray.growth_rate LArray.describe LArray.describe_by @@ -437,8 +439,6 @@ Miscellaneous .. 
autosummary:: :toctree: _generated/ - LArray.ratio - LArray.rationot0 LArray.divnot0 LArray.clip LArray.shift From b81a6c4c9bacfe5b629570676e5372c6b5c747f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 23 Nov 2018 14:54:27 +0100 Subject: [PATCH 13/44] better error message --- larray/core/axis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index a6c7b0058..5c5644280 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -182,7 +182,7 @@ def labels(self): @labels.setter def labels(self, labels): if labels is None: - raise TypeError("labels should be a sequence or a single int") + raise TypeError("labels should be a sequence or a single int, not None") if isinstance(labels, (int, long, np.integer)): length = labels labels = np.arange(length) From 4c7c9b261191799abfa9f54e820e1f3fb3f9598d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 4 Oct 2018 09:31:37 +0200 Subject: [PATCH 14/44] small code cleanups in various functions incidentally it also makes .reshape support string axes collection... 
not that it matters much since it should be private method these days anyway --- larray/core/array.py | 17 ++++++++++------- larray/inout/pandas.py | 8 ++++---- larray/random.py | 4 ++-- larray/util/misc.py | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 763a60642..78a0e0c02 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -1332,10 +1332,10 @@ def rename(self, renames=None, to=None, inplace=False, **kwargs): items = [] items += kwargs.items() renames = {self.axes[k]: v for k, v in items} - axes = [a.rename(renames[a]) if a in renames else a - for a in self.axes] + axes = AxisCollection([a.rename(renames[a]) if a in renames else a + for a in self.axes]) if inplace: - self.axes = AxisCollection(axes) + self.axes = axes return self else: return LArray(self.data, axes) @@ -1834,8 +1834,7 @@ def sort_key(axis): key = key[::-1] return axis.i[key] - res = self[tuple(sort_key(axis) for axis in axes)] - return res + return self[tuple(sort_key(axis) for axis in axes)] sort_axis = renamed_to(sort_axes, 'sort_axis') @@ -2077,6 +2076,7 @@ def set(self, value, **kwargs): """ self.__setitem__(kwargs, value) + # TODO: this should be a private method def reshape(self, target_axes): """ Given a list of new axes, changes the shape of the array. @@ -2121,9 +2121,12 @@ def reshape(self, target_axes): # -> 3, 8 WRONG (non adjacent dimensions) # -> 8, 3 WRONG # 4, 3, 2 -> 2, 2, 3, 2 is potentially ok (splitting dim) - data = np.asarray(self).reshape([len(axis) for axis in target_axes]) + if not isinstance(target_axes, AxisCollection): + target_axes = AxisCollection(target_axes) + data = np.asarray(self).reshape(target_axes.shape) return LArray(data, target_axes) + # TODO: this should be a private method def reshape_like(self, target): """ Same as reshape but with an array as input. 
@@ -8405,7 +8408,7 @@ def raw_broadcastable(values, min_axes=None): """ same as make_numpy_broadcastable but returns numpy arrays """ - arrays, res_axes = make_numpy_broadcastable(values, min_axes) + arrays, res_axes = make_numpy_broadcastable(values, min_axes=min_axes) raw = [a.data if isinstance(a, LArray) else a for a in arrays] return raw, res_axes diff --git a/larray/inout/pandas.py b/larray/inout/pandas.py index c40801977..2a5dc9e8a 100644 --- a/larray/inout/pandas.py +++ b/larray/inout/pandas.py @@ -7,7 +7,7 @@ import pandas as pd from larray.core.array import LArray -from larray.core.axis import Axis +from larray.core.axis import Axis, AxisCollection from larray.core.group import LGroup from larray.core.constants import nan from larray.util.misc import basestring, decode, unique @@ -67,7 +67,7 @@ def cartesian_product_df(df, sort_rows=False, sort_columns=False, fill_value=nan columns = sorted(df.columns) if sort_columns else list(df.columns) # the prodlen test is meant to avoid the more expensive array_equal test prodlen = np.prod([len(axis_labels) for axis_labels in labels]) - if prodlen == len(df) and columns == list(df.columns) and np.array_equal(df.index.values, new_index.values): + if prodlen == len(df) and columns == list(df.columns) and np.array_equal(idx.values, new_index.values): return df, labels return df.reindex(index=new_index, columns=columns, fill_value=fill_value, **kwargs), labels @@ -233,8 +233,8 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo axes_names = [str(name) if name is not None else name for name in axes_names] - axes = [Axis(labels, name) for labels, name in zip(axes_labels, axes_names)] - data = df.values.reshape([len(axis) for axis in axes]) + axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)]) + data = df.values.reshape(axes.shape) return LArray(data, axes, meta=meta) diff --git a/larray/random.py b/larray/random.py index 9437c40ab..08a066e38 100644 
--- a/larray/random.py +++ b/larray/random.py @@ -26,7 +26,7 @@ import numpy as np from larray.core.axis import Axis, AxisCollection -from larray.core.array import LArray, aslarray, stack, ndtest +from larray.core.array import LArray, aslarray from larray.core.array import raw_broadcastable import larray as la @@ -35,7 +35,7 @@ def generic_random(np_func, args, min_axes, meta): - args, res_axes = raw_broadcastable(args, min_axes) + args, res_axes = raw_broadcastable(args, min_axes=min_axes) res_data = np_func(*args, size=res_axes.shape) return LArray(res_data, res_axes, meta=meta) diff --git a/larray/util/misc.py b/larray/util/misc.py index 0acf18ecc..e26fc5c16 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -63,7 +63,7 @@ def is_interactive_interpreter(): def csv_open(filename, mode='r'): assert 'b' not in mode and 't' not in mode - if sys.version < '3': + if PY2: return open(filename, mode + 'b') else: return open(filename, mode, newline='', encoding='utf8') From ae905aec4568bc32480e2cf109c31fbdd9f52d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 2 Oct 2018 11:23:09 +0200 Subject: [PATCH 15/44] made IGroupMaker (axis.i) a valid Sequence (added __len__ and check that key < len(axis) in IGroupMaker) --- larray/core/group.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/larray/core/group.py b/larray/core/group.py index 15ce0aa65..fbd2b5228 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -712,8 +712,13 @@ def __init__(self, axis): self.axis = axis def __getitem__(self, key): + if isinstance(key, (int, np.integer)) and not isinstance(self.axis, ABCAxisReference) and key >= len(self.axis): + raise IndexError("{} is out of range for axis of length {}".format(key, len(self.axis))) return IGroup(key, None, self.axis) + def __len__(self): + return len(self.axis) + # We need a separate class for LGroup and cannot simply create a new Axis with a subset of values/ticks/labels: # the subset of 
ticks/labels of the LGroup need to correspond to its *Axis* indices From f1f1c51d2832efbc1e496d493daebd0e3f17fe4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 23 Nov 2018 11:20:19 +0100 Subject: [PATCH 16/44] WIP: added dtype argument to stack and LArray (need to split commit & add changelog) --- larray/core/array.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 78a0e0c02..9913c078f 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -577,6 +577,8 @@ class LArray(ABCLArray): meta : list of pairs or dict or OrderedDict or Metadata, optional Metadata (title, description, author, creation_date, ...) associated with the array. Keys must be strings. Values must be of type string, int, float, date, time or datetime. + dtype : type, optional + Datatype for the array. Defaults to None (inferred from the data). Attributes ---------- @@ -655,8 +657,8 @@ class LArray(ABCLArray): F 10 11 12 """ - def __init__(self, data, axes=None, title=None, meta=None): - data = np.asarray(data) + def __init__(self, data, axes=None, title=None, meta=None, dtype=None): + data = np.asarray(data, dtype=dtype) ndim = data.ndim if axes is None: axes = AxisCollection(data.shape) @@ -8145,7 +8147,7 @@ def eye(rows, columns=None, k=0, title=None, dtype=None, meta=None): # ('DE', 'M'): 4, ('DE', 'F'): 5}) -def stack(elements=None, axis=None, title=None, meta=None, **kwargs): +def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs): r""" Combines several arrays or sessions along an axis. @@ -8165,6 +8167,8 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): meta : list of pairs or dict or OrderedDict or Metadata, optional Metadata (title, description, author, creation_date, ...) associated with the array. Keys must be strings. Values must be of type string, int, float, date, time or datetime. 
+ dtype : type, optional + Output dtype. Defaults to None (inspect all output values to infer it automatically). Returns ------- @@ -8342,7 +8346,9 @@ def stack(elements=None, axis=None, title=None, meta=None, **kwargs): for v in values] result_axes = AxisCollection.union(*[get_axes(v) for v in values]) result_axes.append(axis) - result = empty(result_axes, dtype=common_type(values), meta=meta) + if dtype is None: + dtype = common_type(values) + result = empty(result_axes, dtype=dtype, meta=meta) for k, v in zip(axis, values): result[k] = v return result From 895241501d231843aa8c6586961e0cb06922ec02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 26 Nov 2018 16:47:11 +0100 Subject: [PATCH 17/44] broadcast ufuncs kwargs --- larray/core/array.py | 12 ++++++++++++ larray/core/ufuncs.py | 12 +++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 9913c078f..cbb2fb472 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -8420,6 +8420,18 @@ def raw_broadcastable(values, min_axes=None): return raw, res_axes +def make_args_broadcastable(args, kwargs=None, min_axes=None): + """ + Make args and kwargs (NumPy) broadcastable between them. 
+ """ + values = (args + tuple(kwargs.values())) if kwargs is not None else args + first_kw = len(args) + raw_bcast_values, res_axes = raw_broadcastable(values, min_axes=min_axes) + raw_bcast_args = raw_bcast_values[:first_kw] + raw_bcast_kwargs = dict(zip(kwargs.keys(), raw_bcast_values[first_kw:])) + return raw_bcast_args, raw_bcast_kwargs, res_axes + + _default_float_error_handler = float_error_handler_factory(3) diff --git a/larray/core/ufuncs.py b/larray/core/ufuncs.py index 1f1d019d6..d59b30af9 100644 --- a/larray/core/ufuncs.py +++ b/larray/core/ufuncs.py @@ -3,15 +3,13 @@ import numpy as np -from larray.core.array import LArray, raw_broadcastable +from larray.core.array import LArray, make_args_broadcastable def broadcastify(func): # intentionally not using functools.wraps, because it does not work for wrapping a function from another module def wrapper(*args, **kwargs): - # TODO: normalize args/kwargs like in LIAM2 so that we can also broadcast if args are given via kwargs - # (eg out=) - raw_args, combined_axes = raw_broadcastable(args) + raw_bcast_args, raw_bcast_kwargs, res_axes = make_args_broadcastable(args, kwargs) # We pass only raw numpy arrays to the ufuncs even though numpy is normally meant to handle those cases itself # via __array_wrap__ @@ -25,9 +23,9 @@ def wrapper(*args, **kwargs): # It fails on "np.minimum(ndarray, LArray)" because it calls __array_wrap__(high, result) which cannot work if # there was broadcasting involved (high has potentially less labels than result). 
# it does this because numpy calls __array_wrap__ on the argument with the highest __array_priority__ - res_data = func(*raw_args, **kwargs) - if combined_axes: - return LArray(res_data, combined_axes) + res_data = func(*raw_bcast_args, **raw_bcast_kwargs) + if res_axes: + return LArray(res_data, res_axes) else: return res_data # copy meaningful attributes (numpy ufuncs do not have __annotations__ nor __qualname__) From 744709957ab80394477626d7f866362160b1f45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 3 Oct 2018 11:12:32 +0200 Subject: [PATCH 18/44] added SequenceZip as an alternative to builtin zip --- larray/util/misc.py | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/larray/util/misc.py b/larray/util/misc.py index e26fc5c16..ac8058bb3 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -775,3 +775,55 @@ def __enter__(self): def __exit__(self, type_, value, traceback): if self.close_store: self.store.close() + + +class SequenceZip(object): + """ + Represents the "combination" of several sequences. + + This is very similar to python's builtin zip but only accepts sequences and acts as a Sequence (it can be + indexed and has a len). + + Parameters + ---------- + sequences : Iterable of Sequence + Sequences to combine. + + Examples + -------- + >>> z = SequenceZip([['a', 'b', 'c'], [1, 2, 3]]) + >>> for i in range(len(z)): + ... print(z[i]) + ('a', 1) + ('b', 2) + ('c', 3) + >>> for v in z: + ... 
print(v) + ('a', 1) + ('b', 2) + ('c', 3) + >>> list(z[1:4]) + [('b', 2), ('c', 3)] + """ + def __init__(self, sequences): + self.sequences = sequences + length = len(sequences[0]) + bad_length_seqs = [i for i, s in enumerate(sequences[1:], start=1) if len(s) != length] + if bad_length_seqs: + first_bad = bad_length_seqs[0] + raise ValueError("sequence {} has a length of {} which is different from the length of the " + "first sequence ({})".format(first_bad, len(sequences[first_bad]), length)) + self._length = length + + def __len__(self): + return self._length + + def __getitem__(self, key): + if isinstance(key, (int, np.integer)): + return tuple(seq[key] for seq in self.sequences) + else: + assert isinstance(key, slice), "key (%s) has invalid type (%s)" % (key, type(key)) + return SequenceZip([seq[key] for seq in self.sequences]) + + def __iter__(self): + return zip(*self.sequences) From 48285382210c9ce4a02a833770be5aa7893d3353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 4 Oct 2018 09:18:41 +0200 Subject: [PATCH 19/44] added Product class (from larray_editor) to make product for Sequences similar to itertools.product but can be indexed and has a length --- larray/util/misc.py | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/larray/util/misc.py b/larray/util/misc.py index ac8058bb3..1529c89ad 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -827,3 +827,67 @@ def __getitem__(self, key): def __iter__(self): return zip(*self.sequences) + + +# TODO: remove Product from larray_editor.utils (it is almost identical) +class Product(object): + """ + Represents the `cartesian product` of several sequences. + + This is very similar to itertools.product but only accepts sequences and acts as a sequence (it can be + indexed and has a len). + + Parameters + ---------- + sequences : Iterable of Sequence + Sequences on which to apply the cartesian product. 
+ + Examples + -------- + >>> p = Product([['a', 'b', 'c'], [1, 2]]) + >>> for i in range(len(p)): + ... print(p[i]) + ('a', 1) + ('a', 2) + ('b', 1) + ('b', 2) + ('c', 1) + ('c', 2) + >>> p[1:4] + [('a', 2), ('b', 1), ('b', 2)] + >>> p[-3:] + [('b', 2), ('c', 1), ('c', 2)] + >>> list(p) + [('a', 1), ('a', 2), ('b', 1), ('b', 2), ('c', 1), ('c', 2)] + """ + def __init__(self, sequences): + self.sequences = sequences + assert len(sequences) + shape = [len(a) for a in self.sequences] + self._div_mod = [(int(np.prod(shape[i + 1:])), shape[i]) + for i in range(len(shape))] + self._length = np.prod(shape) + + def __len__(self): + return self._length + + def __getitem__(self, key): + if isinstance(key, (int, np.integer)): + if key >= self._length: + raise IndexError("index %d out of range for Product of length %d" % (key, self._length)) + # this is similar to np.unravel_index but a tad faster for scalars + return tuple(array[key // div % mod] + for array, (div, mod) in zip(self.sequences, self._div_mod)) + else: + assert isinstance(key, slice), "key (%s) has invalid type (%s)" % (key, type(key)) + start, stop, step = key.indices(self._length) + div_mod = self._div_mod + arrays = self.sequences + # XXX: we probably want to return another Product object with an updated start/stop to stay + # lazy in that case too. 
+ return [tuple(array[idx // div % mod] + for array, (div, mod) in zip(arrays, div_mod)) + for idx in range(start, stop, step)] + + def __iter__(self): + return product(*self.sequences) From 7441a8085e53932841a5ca10bc6636fd90ea78fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Thu, 13 Dec 2018 15:32:42 +0100 Subject: [PATCH 20/44] WIP: implemented Repeater (need to split Product.__repr__ out of this commit) --- larray/util/misc.py | 75 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/larray/util/misc.py b/larray/util/misc.py index 1529c89ad..40968e2e0 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -829,6 +829,78 @@ def __iter__(self): return zip(*self.sequences) +class Repeater(object): + """ + Returns a virtual sequence with value repeated n times. + The sequence is never actually created in memory. + + Parameters + ---------- + value : any + Value to repeat. + n : int + Number of times to repeat value. + + Notes + ----- + This is very similar to itertools.repeat except this version returns a Sequence instead of an iterator, + meaning it has a length and can be indexed. + + Examples + -------- + >>> r = Repeater('a', 3) + >>> list(r) + ['a', 'a', 'a'] + >>> r[0] + 'a' + >>> r[2] + 'a' + >>> r[3] + Traceback (most recent call last): + ... + IndexError: index out of range + >>> r[-1] + 'a' + >>> r[-3] + 'a' + >>> r[-4] + Traceback (most recent call last): + ... 
+ IndexError: index out of range + >>> len(r) + 3 + >>> list(r[1:]) + ['a', 'a'] + >>> list(r[:2]) + ['a', 'a'] + >>> list(r[10:]) + [] + """ + def __init__(self, value, n): + self.value = value + self.n = n + + def __len__(self): + return self.n + + def __getitem__(self, key): + if isinstance(key, (int, np.integer)): + if key >= self.n or key < -self.n: + raise IndexError('index out of range') + return self.value + else: + assert isinstance(key, slice), "key (%s) has invalid type (%s)" % (key, type(key)) + start, stop, step = key.indices(self.n) + # XXX: unsure // step is correct + return Repeater(self.value, (stop - start) // step) + + def __iter__(self): + return itertools.repeat(self.value, self.n) + + def __repr__(self): + return 'Repeater({}, {})'.format(self.value, self.n) + + # TODO: remove Product from larray_editor.utils (it is almost identical) class Product(object): """ @@ -891,3 +963,6 @@ def __getitem__(self, key): def __iter__(self): return product(*self.sequences) + + def __repr__(self): + return 'Product({})'.format(self.sequences) From cafa35d1ae641c61051e52f949e859808c57b0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 15:43:08 +0200 Subject: [PATCH 21/44] allow creating a Session from any object having a .items method (e.g. 
another Session) --- larray/core/session.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/larray/core/session.py b/larray/core/session.py index c40ebfcb7..b2b6ebe24 100644 --- a/larray/core/session.py +++ b/larray/core/session.py @@ -94,10 +94,12 @@ def __init__(self, *args, **kwargs): if isinstance(a0, str): # assume a0 is a filename self.load(a0) + elif hasattr(a0, 'items'): + for k, v in a0.items(): + self[k] = v else: - items = a0.items() if isinstance(a0, dict) else a0 # assume we have an iterable of tuples - for k, v in items: + for k, v in a0: self[k] = v else: self.add(*args, **kwargs) From 524ca0e57238492857097e1a22642af5ffc5a1ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 15:57:39 +0200 Subject: [PATCH 22/44] optimized iteration speed over LArray, Group and Axis --- larray/core/array.py | 25 +++++++++++++++---------- larray/core/axis.py | 2 +- larray/core/group.py | 3 ++- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index cbb2fb472..7965dc9f6 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -311,20 +311,21 @@ def concat(arrays, axis=0, dtype=None): class LArrayIterator(object): def __init__(self, array): - self.array = array - self.index = 0 + data_iter = iter(array.data) + self.nextfunc = data_iter.__next__ + self.axes = array.axes[1:] def __iter__(self): return self def __next__(self): - array = self.array - if self.index == len(self.array): - raise StopIteration - # result = array.i[array.axes[0].i[self.index]] - result = array.i[self.index] - self.index += 1 - return result + data = self.nextfunc() + axes = self.axes + if len(axes): + return LArray(data, axes) + else: + return data + # Python 2 next = __next__ @@ -2290,7 +2291,11 @@ def __str__(self): __repr__ = __str__ def __iter__(self): - return LArrayIterator(self) + # fast path for 1D arrays where we return elements + if self.ndim <= 1: + return 
iter(self.data) + else: + return LArrayIterator(self) def __contains__(self, key): return any(key in axis for axis in self.axes) diff --git a/larray/core/axis.py b/larray/core/axis.py index 5c5644280..62dc179aa 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -721,7 +721,7 @@ def __len__(self): return self._length def __iter__(self): - return iter([self.i[i] for i in range(self._length)]) + return iter([IGroup(i, None, self) for i in range(self._length)]) def __getitem__(self, key): """ diff --git a/larray/core/group.py b/larray/core/group.py index fbd2b5228..b9b86929e 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -870,7 +870,8 @@ def __len__(self): def __iter__(self): # XXX: use translate/IGroup instead, so that it works even in the presence of duplicate labels # possibly, only if axis is set? - return iter([LGroup(v, axis=self.axis) for v in self.eval()]) + axis = self.axis + return iter([LGroup(v, axis=axis) for v in self.eval()]) def named(self, name): """Returns group with a different name. From 6cb8a8b4b135e94424720f50b051418ce74d54fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 16:24:22 +0200 Subject: [PATCH 23/44] slightly faster AxisCollection.combine_axes --- larray/core/axis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 62dc179aa..06ec1c214 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -2904,9 +2904,9 @@ def combine_axes(self, axes=None, sep='_', wildcard=False, front_if_spread=False # Q: if axis is a wildcard axis, should the result be a wildcard axis (and axes_labels discarded?) 
combined_labels = _axes[0].labels else: - combined_labels = [sep.join(str(l) for l in p) - for p in product(*_axes.labels)] - + sepjoin = sep.join + axes_labels = [np.array(l, np.str, copy=False) for l in _axes.labels] + combined_labels = [sepjoin(p) for p in product(*axes_labels)] combined_axis = Axis(combined_labels, combined_name) new_axes = new_axes - _axes new_axes.insert(combined_axis_pos, combined_axis) From 8f1f886e76276fa67ec528de1ad8e11ccbee6bce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 16:05:56 +0200 Subject: [PATCH 24/44] added __slots__ to all our structures this makes attribute access a tad faster and uses less memory (because it does not create a __dict__ for a each instance) --- larray/core/array.py | 14 +++++++++++--- larray/core/axis.py | 10 +++++++--- larray/core/group.py | 14 +++++++++++--- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 7965dc9f6..4587c081d 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -310,6 +310,8 @@ def concat(arrays, axis=0, dtype=None): class LArrayIterator(object): + __slots__ = ('nextfunc', 'axes') + def __init__(self, array): data_iter = iter(array.data) self.nextfunc = data_iter.__next__ @@ -332,6 +334,7 @@ def __next__(self): # TODO: rename to LArrayIndexIndexer or something like that class LArrayPositionalIndexer(object): + __slots__ = ('array',) """ numpy indexing *except* we index the cross product """ @@ -362,6 +365,8 @@ def __len__(self): class LArrayPointsIndexer(object): + __slots__ = ('array',) + def __init__(self, array): self.array = array @@ -383,6 +388,7 @@ def __setitem__(self, key, value): # TODO: rename to LArrayIndexPointsIndexer or something like that class LArrayPositionalPointsIndexer(object): + __slots__ = ('array',) """ the closest to numpy indexing we get, but not 100% the same. 
""" @@ -657,6 +663,7 @@ class LArray(ABCLArray): M 10 9 8 F 10 11 12 """ + __slots__ = ('data', 'axes', '_meta') def __init__(self, data, axes=None, title=None, meta=None, dtype=None): data = np.asarray(data, dtype=dtype) @@ -886,14 +893,15 @@ def __getattr__(self, key): # needed to make *un*pickling work (because otherwise, __getattr__ is called before .axes exists, which leads to # an infinite recursion) def __getstate__(self): - return self.__dict__ + return self.data, self.axes, self._meta def __setstate__(self, d): - self.__dict__ = d + self.data, self.axes, self._meta = d def __dir__(self): axis_names = set(axis.name for axis in self.axes if axis.name is not None) - return list(set(dir(self.__class__)) | set(self.__dict__.keys()) | axis_names) + attributes = self.__slots__ + return list(set(dir(self.__class__)) | set(attributes) | axis_names) def _ipython_key_completions_(self): return list(chain(*[list(labels) for labels in self.axes.labels])) diff --git a/larray/core/axis.py b/larray/core/axis.py index 06ec1c214..11258e505 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -75,6 +75,8 @@ class Axis(ABCAxis): >>> anonymous Axis([0, 1, 2, 3, 4], None) """ + __slots__ = ('name', '__mapping', '__sorted_keys', '__sorted_values', '_labels', '_length', '_iswildcard') + # ticks instead of labels? def __init__(self, labels, name=None): if isinstance(labels, Group) and name is None: @@ -1371,6 +1373,7 @@ def _make_axis(obj): # not using namedtuple because we have to know the fields in advance (it is a one-off class) and we need more # functionality than just a named tuple class AxisCollection(object): + __slots__ = ('_list', '_map') """ Represents a collection of axes. 
@@ -1464,10 +1467,11 @@ def __getattr__(self, key): # needed to make *un*pickling work (because otherwise, __getattr__ is called before _map exists, which leads to # an infinite recursion) def __getstate__(self): - return self.__dict__ + return self._list - def __setstate__(self, d): - self.__dict__ = d + def __setstate__(self, state): + self._list = state + self._map = {axis.name: axis for axis in state if axis.name is not None} def __getitem__(self, key): if isinstance(key, Axis): diff --git a/larray/core/group.py b/larray/core/group.py index b9b86929e..cb3db85c3 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -707,6 +707,8 @@ class IGroupMaker(object): ----- This class is used by the method `Axis.i` """ + __slots__ = ('axis',) + def __init__(self, axis): assert isinstance(axis, ABCAxis) self.axis = axis @@ -725,6 +727,8 @@ def __len__(self): class Group(object): """Abstract Group. """ + __slots__ = ('key', 'name', 'axis') + format_string = None def __init__(self, key, name=None, axis=None): @@ -1485,7 +1489,8 @@ def __array__(self, dtype=None): def __dir__(self): # called by dir() and tab-completion at the interactive prompt, must return a list of any valid getattr key. # dir() takes care of sorting but not uniqueness, so we must ensure that. 
- return list(set(dir(self.eval())) | set(self.__dict__.keys()) | set(dir(self.__class__))) + attributes = self.__slots__ + return list(set(dir(self.eval())) | set(attributes) | set(dir(self.__class__))) def __getattr__(self, key): if key == '__array_struct__': @@ -1496,10 +1501,10 @@ def __getattr__(self, key): # needed to make *un*pickling work (because otherwise, __getattr__ is called before .key exists, which leads to # an infinite recursion) def __getstate__(self): - return self.__dict__ + return (self.key, self.name, self.axis) def __setstate__(self, d): - self.__dict__ = d + self.key, self.name, self.axis = d def __hash__(self): # to_tick & to_key are partially opposite operations but this standardize on a single notation so that they can @@ -1557,6 +1562,7 @@ class LGroup(Group): >>> teens X.age[10:19] >> 'teens' """ + __slots__ = () format_string = "{axis}[{key}]" def __init__(self, key, name=None, axis=None): @@ -1616,6 +1622,7 @@ class LSet(LGroup): >>> abc & letters['b:d'] letters['b', 'c'].set() """ + __slots__ = () format_string = "{axis}[{key}].set()" def __init__(self, key, name=None, axis=None): @@ -1678,6 +1685,7 @@ class IGroup(Group): axis : int, str, Axis, optional Axis for group. 
""" + __slots__ = () format_string = "{axis}.i[{key}]" def translate(self, bound=None, stop=False): From 4c64564cbe2cfae0712addac93eca91aa7a300e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 16:41:33 +0200 Subject: [PATCH 25/44] implemented faster metadata handling .meta can be None and Metadata() is only created when needed --- larray/core/array.py | 56 +++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 4587c081d..f724cb100 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -558,15 +558,27 @@ def nan_equal(a1, a2): def _handle_deprecated_argument_title(meta, title): - if meta is None: - meta = Metadata() if title is not None: + if meta is None: + meta = Metadata() import warnings warnings.warn("title argument is deprecated. Please use meta argument instead", FutureWarning, stacklevel=3) meta['title'] = title return meta +# make sure meta is either None or a Metadata instance +def _handle_meta(meta, title): + if title is not None: + meta = _handle_deprecated_argument_title(meta, title) + if meta is None or isinstance(meta, Metadata): + return meta + if not isinstance(meta, (list, dict, OrderedDict)): + raise TypeError("Expected None, list of pairs, dict, OrderedDict or Metadata object " + "instead of {}".format(type(meta).__name__)) + return Metadata(meta) + + class LArray(ABCLArray): """ A LArray object represents a multidimensional, homogeneous array of fixed-size items with labeled axes. @@ -684,14 +696,14 @@ def __init__(self, data, axes=None, title=None, meta=None, dtype=None): self.data = data self.axes = axes - meta = _handle_deprecated_argument_title(meta, title) - self.meta = meta + meta = _handle_meta(meta, title) + self._meta = meta @property def title(self): import warnings warnings.warn("title attribute is deprecated. 
Please use meta.title instead", FutureWarning, stacklevel=2) - return self._meta.title if 'title' in self._meta else None + return self._meta.title if self._meta is not None and 'title' in self._meta else None @title.setter def title(self, title): @@ -710,14 +722,13 @@ def meta(self): Metadata: Metadata of the array. """ + if self._meta is None: + self._meta = Metadata() return self._meta @meta.setter def meta(self, meta): - if not isinstance(meta, (list, dict, OrderedDict, Metadata)): - raise TypeError("Expected list of pairs or dict or OrderedDict or Metadata object " - "instead of {}".format(type(meta).__name__)) - self._meta = meta if isinstance(meta, Metadata) else Metadata(meta) + self._meta = _handle_meta(meta, None) # TODO: rename to posnonzero and implement a label version of nonzero # TODO: implement wildcard argument to avoid producing the combined labels @@ -7279,7 +7290,8 @@ def zeros(axes, title=None, dtype=float, order='C', meta=None): BE 0.0 0.0 FO 0.0 0.0 """ - meta = _handle_deprecated_argument_title(meta, title) + # FIXME: the error message is wrong (stackdepth is wrong) because of _check_axes_argument + meta = _handle_meta(meta, title) axes = AxisCollection(axes) return LArray(np.zeros(axes.shape, dtype, order), axes, meta=meta) @@ -7315,7 +7327,7 @@ def zeros_like(array, title=None, dtype=None, order='K', meta=None): a0 0 0 0 a1 0 0 0 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) return LArray(np.zeros_like(array, dtype, order), array.axes, meta=meta) @@ -7351,7 +7363,7 @@ def ones(axes, title=None, dtype=float, order='C', meta=None): BE 1.0 1.0 FO 1.0 1.0 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) axes = AxisCollection(axes) return LArray(np.ones(axes.shape, dtype, order), axes, meta=meta) @@ -7387,7 +7399,7 @@ def ones_like(array, title=None, dtype=None, order='K', meta=None): a0 1 1 1 a1 1 1 1 """ - meta = 
_handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) axes = array.axes return LArray(np.ones_like(array, dtype, order), axes, meta=meta) @@ -7424,7 +7436,7 @@ def empty(axes, title=None, dtype=float, order='C', meta=None): BE 2.47311483356e-315 2.47498446195e-315 FO 0.0 6.07684618082e-31 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) axes = AxisCollection(axes) return LArray(np.empty(axes.shape, dtype, order), axes, meta=meta) @@ -7461,7 +7473,7 @@ def empty_like(array, title=None, dtype=None, order='K', meta=None): a1 1.06099789568e-313 1.48539705397e-313 a2 1.90979621226e-313 2.33419537056e-313 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) # cannot use empty() because order == 'K' is not understood return LArray(np.empty_like(array.data, dtype, order), array.axes, meta=meta) @@ -7508,7 +7520,7 @@ def full(axes, fill_value, title=None, dtype=None, order='C', meta=None): BE 0 1 FO 0 1 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) if isinstance(fill_value, Axis): raise ValueError("If you want to pass several axes or dimension lengths to full, you must pass them as a " "list (using []) or tuple (using()).") @@ -7552,7 +7564,7 @@ def full_like(array, fill_value, title=None, dtype=None, order='K', meta=None): a0 5 5 5 a1 5 5 5 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) # cannot use full() because order == 'K' is not understood # cannot use np.full_like() because it would not handle LArray fill_value res = empty_like(array, dtype=dtype, meta=meta) @@ -7666,7 +7678,7 @@ def sequence(axis, initial=0, inc=None, mult=1, func=None, axes=None, title=None year 2016 2017 2018 2019 1.0 2.0 3.0 3.0 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) if inc is None: inc = 1 if mult is 1 else 0 @@ -7863,7 +7875,7 @@ 
def ndtest(shape_or_axes, start=0, label_start=0, title=None, dtype=int, meta=No BE 0 1 FO 2 3 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) # XXX: try to come up with a syntax where start is before "end". # For ndim > 1, I cannot think of anything nice. if isinstance(shape_or_axes, int): @@ -8014,7 +8026,7 @@ def labels_array(axes, title=None, meta=None): # nat\\sex M F # BE BE,M BE,F # FO FO,M FO,F - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) axes = AxisCollection(axes) if len(axes) > 1: res_axes = axes + Axis(axes.names, 'axis') @@ -8084,7 +8096,7 @@ def eye(rows, columns=None, k=0, title=None, dtype=None, meta=None): 1 0.0 0.0 1.0 2 0.0 0.0 0.0 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) if columns is None: columns = rows.copy() if isinstance(rows, Axis) else rows axes = AxisCollection([rows, columns]) @@ -8283,7 +8295,7 @@ def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs) M 0.0 0.5 F 0.0 0.5 """ - meta = _handle_deprecated_argument_title(meta, title) + meta = _handle_meta(meta, title) from larray import Session From 3d4684f20bab043de39689f140603abf5143445a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 26 Sep 2018 07:57:08 +0200 Subject: [PATCH 26/44] factorized a unique_multi helper function it computes unique values across multiple iterables --- larray/core/array.py | 5 +---- larray/core/axis.py | 8 ++------ larray/util/misc.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index f724cb100..8b26cbbd7 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -8352,10 +8352,7 @@ def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs) if not all(isinstance(s, Session) for s in sessions): raise TypeError("stack() only supports stacking Session with 
other Session objects") - seen = set() - all_keys = [] - for s in sessions: - unique_list(s.keys(), all_keys, seen) + all_keys = unique_multi(s.keys() for s in sessions) res = [] for name in all_keys: try: diff --git a/larray/core/axis.py b/larray/core/axis.py index 11258e505..849c7edb8 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -16,7 +16,7 @@ _range_to_slice, _seq_group_to_name, _translate_group_key_hdf, remove_nested_groups) from larray.util.oset import * from larray.util.misc import (basestring, PY2, unicode, long, duplicates, array_lookup2, ReprString, index_by_id, - renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice) + renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice, unique_multi) class Axis(ABCAxis): @@ -1145,11 +1145,7 @@ def union(self, other): other = _to_ticks(other, parse_single_int=True) if '..' in other or ',' in other else [other] if isinstance(other, Axis): other = other.labels - unique_labels = [] - seen = set() - unique_list(self.labels, unique_labels, seen) - unique_list(other, unique_labels, seen) - return Axis(unique_labels, self.name) + return Axis(unique_multi((self.labels, other)), self.name) def intersection(self, other): """Returns axis with the (set) intersection of this axis labels and other labels. diff --git a/larray/util/misc.py b/larray/util/misc.py index 40968e2e0..dd2a6b7c7 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -234,6 +234,17 @@ def unique_list(iterable, res=None, seen=None): return res +def unique_multi(iterable_of_iterables): + """ + Returns a list of all unique elements across multiple iterables. Elements of earlier iterables will come first. + """ + seen = set() + res = [] + for iterable in iterable_of_iterables: + unique_list(iterable, res, seen) + return res + + def duplicates(iterable): """ List duplicated elements once, preserving order. Remember all elements ever seen. 
From 12717dd0030f603c81255185bdadae3c0939be59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 26 Sep 2018 10:56:29 +0200 Subject: [PATCH 27/44] simplified AxisCollection._flat_lookup by using np.unravel_index --- larray/core/axis.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 849c7edb8..d5c588077 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -3153,14 +3153,12 @@ def _flat_lookup(self, flat_indices): from larray.core.array import aslarray, LArray, stack flat_indices = aslarray(flat_indices) - shape = self.shape - divisors = np.roll(np.cumprod(shape[::-1])[::-1], -1) - divisors[-1] = 1 - axes_indices = [(flat_indices // div) % length for div, length in zip(divisors, shape)] + axes_indices = np.unravel_index(flat_indices, self.shape) # This could return an LArray with object dtype because axes labels can have different types (but not length) # TODO: this should be: # return stack([(axis.name, axis.i[inds]) for axis, inds in zip(axes, axes_indices)], axis='axis') - return stack([(axis.name, LArray(axis.labels[inds], inds.axes)) for axis, inds in zip(self, axes_indices)], + flat_axes = flat_indices.axes + return stack([(axis.name, LArray(axis.labels[inds], flat_axes)) for axis, inds in zip(self, axes_indices)], axis='axis') def _adv_keys_to_combined_axis_la_keys(self, key, wildcard=False, sep='_'): From 80564e9ef4cce168e59d03c261ae7af0d66c70fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 3 Oct 2018 17:36:33 +0200 Subject: [PATCH 28/44] simplified _adv_keys_to_combined_axis_la_keys --- larray/core/axis.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index d5c588077..7ce0ae1d4 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -3189,25 +3189,27 @@ def _adv_keys_to_combined_axis_la_keys(self, key, 
wildcard=False, sep='_'): # TODO: use/factorize with AxisCollection.combine_axes. The problem is that it uses product(*axes_labels) # while here we need zip(*axes_labels) ignored_types = (int, np.integer, slice, LArray) - adv_key_axes = [axis for axis_key, axis in zip(key, self) - if not isinstance(axis_key, ignored_types)] - if not adv_key_axes: + adv_keys = [(axis_key, axis) for axis_key, axis in zip(key, self) + if not isinstance(axis_key, ignored_types)] + if not adv_keys: return key # axes with a scalar key are not taken, since we want to kill them # all anonymous axes => anonymous combined axis - if all(axis.name is None for axis in adv_key_axes): + if all(axis.name is None for axis_key, axis in adv_keys): combined_name = None else: # using axis_id instead of name to allow combining a mix of anonymous & non anonymous axes - combined_name = sep.join(str(self.axis_id(axis)) for axis in adv_key_axes) + combined_name = sep.join(str(self.axis_id(axis)) for axis_key, axis in adv_keys) + + # explicitly check that all combined keys have the same length + first_key, first_axis = adv_keys[0] + combined_axis_len = len(first_key) + if not all(len(axis_key) == combined_axis_len for axis_key, axis in adv_keys[1:]): + raise ValueError("all combined keys should have the same length") if wildcard: - lengths = [len(axis_key) for axis_key in key - if not isinstance(axis_key, ignored_types)] - combined_axis_len = lengths[0] - assert all(l == combined_axis_len for l in lengths) combined_axis = Axis(combined_axis_len, combined_name) else: # TODO: the combined keys should be objects which display as: @@ -3217,31 +3219,24 @@ def _adv_keys_to_combined_axis_la_keys(self, key, wildcard=False, sep='_'): # A: yes, probably. On the Pandas backend, we could/should have # separate axes. On the numpy backend we cannot. 
# TODO: only convert if - if len(adv_key_axes) == 1: - # we don't convert to string when there is only a single axis + if len(adv_keys) == 1: + # we do not convert to string when there is only a single axis axes_labels = [axis.labels[axis_key] - for axis_key, axis in zip(key, self) - if not isinstance(axis_key, ignored_types)] + for axis_key, axis in adv_keys] # Q: if axis is a wildcard axis, should the result be a # wildcard axis (and axes_labels discarded?) combined_labels = axes_labels[0] else: axes_labels = [axis.labels.astype(np.str, copy=False)[axis_key].tolist() - for axis_key, axis in zip(key, self) - if not isinstance(axis_key, ignored_types)] + for axis_key, axis in adv_keys] sepjoin = sep.join combined_labels = [sepjoin(comb) for comb in zip(*axes_labels)] combined_axis = Axis(combined_labels, combined_name) # 2) transform all advanced non-LArray keys to LArray with the combined axis # ========================================================================== - def to_la_key(axis_key, combined_axis): - if isinstance(axis_key, (int, np.integer, slice, LArray)): - return axis_key - else: - assert len(axis_key) == len(combined_axis) - return LArray(axis_key, combined_axis) - return tuple(to_la_key(axis_key, combined_axis) for axis_key in key) + return tuple(axis_key if isinstance(axis_key, ignored_types) else LArray(axis_key, combined_axis) + for axis_key in key) class AxisReference(ABCAxisReference, ExprNode, Axis): From 4af9381f1e6db84b221fc0297c751121bf7b1417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 8 Oct 2018 17:31:50 +0200 Subject: [PATCH 29/44] simplified FileHandler._read_item by returning only the value, not the key (the key wasn't used anywhere) --- larray/inout/common.py | 3 +-- larray/inout/csv.py | 6 +++--- larray/inout/excel.py | 12 ++++++------ larray/inout/hdf.py | 2 +- larray/inout/pickle.py | 4 ++-- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/larray/inout/common.py 
b/larray/inout/common.py index a283571a2..b3efaf915 100644 --- a/larray/inout/common.py +++ b/larray/inout/common.py @@ -127,8 +127,7 @@ def read(self, keys, *args, **kwargs): if display: print("loading", type, "object", key, "...", end=' ') try: - key, item = self._read_item(key, type, *args, **kwargs) - res[key] = item + res[key] = self._read_item(key, type, *args, **kwargs) except Exception: if not ignore_exceptions: raise diff --git a/larray/inout/csv.py b/larray/inout/csv.py index 07836b1ba..0f89b736b 100644 --- a/larray/inout/csv.py +++ b/larray/inout/csv.py @@ -342,11 +342,11 @@ def list_items(self): def _read_item(self, key, type, *args, **kwargs): if type == 'Array': - return key, read_csv(self._to_filepath(key), *args, **kwargs) + return read_csv(self._to_filepath(key), *args, **kwargs) elif type == 'Axis': - return key, self.axes[key] + return self.axes[key] elif type == 'Group': - return key, self.groups[key] + return self.groups[key] else: raise TypeError() diff --git a/larray/inout/excel.py b/larray/inout/excel.py index 9eecfffa8..69417bb33 100644 --- a/larray/inout/excel.py +++ b/larray/inout/excel.py @@ -282,11 +282,11 @@ def list_items(self): def _read_item(self, key, type, *args, **kwargs): if type == 'Array': df = self.handle.parse(key, *args, **kwargs) - return key, df_aslarray(df, raw=True) + return df_aslarray(df, raw=True) elif type == 'Axis': - return key, self.axes[key] + return self.axes[key] elif type == 'Group': - return key, self.groups[key] + return self.groups[key] else: raise TypeError() @@ -386,11 +386,11 @@ def list_items(self): def _read_item(self, key, type, *args, **kwargs): if type == 'Array': - return key, self.handle[key].load(*args, **kwargs) + return self.handle[key].load(*args, **kwargs) elif type == 'Axis': - return key, self.axes[key] + return self.axes[key] elif type == 'Group': - return key, self.groups[key] + return self.groups[key] else: raise TypeError() diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 
0ce326a9d..f5d656603 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -126,7 +126,7 @@ def _read_item(self, key, type, *args, **kwargs): kwargs['name'] = key else: raise TypeError() - return key, read_hdf(self.handle, hdf_key, *args, **kwargs) + return read_hdf(self.handle, hdf_key, *args, **kwargs) def _dump_item(self, key, value, *args, **kwargs): if isinstance(value, LArray): diff --git a/larray/inout/pickle.py b/larray/inout/pickle.py index 1c8085494..8b4c98501 100644 --- a/larray/inout/pickle.py +++ b/larray/inout/pickle.py @@ -34,8 +34,8 @@ def list_items(self): return items def _read_item(self, key, type, *args, **kwargs): - if type in ['Array', 'Axis', 'Group']: - return key, self.data[key] + if type in {'Array', 'Axis', 'Group'}: + return self.data[key] else: raise TypeError() From a10358f79546b180cc38682f3a9a8a0b1e33fd9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 24 Sep 2018 12:43:27 +0200 Subject: [PATCH 30/44] implemented read_stata and LArray.to_stata --- doc/source/api.rst | 2 ++ doc/source/changes/version_0_30.rst.inc | 18 ++++++++++++++ larray/__init__.py | 3 ++- larray/core/array.py | 16 ++++++++++++ larray/inout/stata.py | 33 +++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 larray/inout/stata.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 8cc8a3c72..44512920a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -623,6 +623,7 @@ Read read_hdf read_eurostat read_sas + read_stata Write ----- @@ -633,6 +634,7 @@ Write LArray.to_csv LArray.to_excel LArray.to_hdf + LArray.to_stata Excel ===== diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc index dde69b6d3..e550fba5a 100644 --- a/doc/source/changes/version_0_30.rst.inc +++ b/doc/source/changes/version_0_30.rst.inc @@ -100,6 +100,24 @@ Backward incompatible changes New features ------------ +* implemented :py:obj:`read_stata()` and 
:py:obj:`LArray.to_stata()` to read arrays from and write arrays to Stata .dta + files. + + >>> arr = ndtest((2, 3)) + >>> arr + a\b b0 b1 b2 + a0 0 1 2 + a1 3 4 5 + >>> arr.to_stata('test.dta') + >>> read_stata('test.dta') + {0}\{1} a b0 b1 b2 + 0 a0 0 1 2 + 1 a1 3 4 5 + >>> read_stata('test.dta', index_col='a') + a\{1} b0 b1 b2 + a0 0 1 2 + a1 3 4 5 + * added :py:obj:`LArray.isin()` method to check whether each element of an array is contained in a list (or array) of values. diff --git a/larray/__init__.py b/larray/__init__.py index 169d07fa5..77e5494dc 100644 --- a/larray/__init__.py +++ b/larray/__init__.py @@ -28,6 +28,7 @@ from larray.inout.excel import read_excel from larray.inout.hdf import read_hdf from larray.inout.sas import read_sas +from larray.inout.stata import read_stata from larray.inout.xw_excel import open_excel, Workbook from larray.viewer import view, edit, compare @@ -67,7 +68,7 @@ 'real_if_close', 'interp', 'isnan', 'isinf', 'inverse', # inout 'from_lists', 'from_string', 'from_frame', 'from_series', 'read_csv', 'read_tsv', - 'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'open_excel', 'Workbook', + 'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'read_stata', 'open_excel', 'Workbook', # viewer 'view', 'edit', 'compare', # ipfp diff --git a/larray/core/array.py b/larray/core/array.py index 8b26cbbd7..7f1a874ca 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -6210,6 +6210,22 @@ def to_hdf(self, filepath, key): store.get_storer(key).attrs.type = 'Array' self.meta.to_hdf(store, key) + def to_stata(self, filepath_or_buffer, **kwargs): + """ + Writes array to a Stata .dta file. + + Parameters + ---------- + filepath_or_buffer : str or file-like object + Path to .dta file or a file handle. 
+ + Examples + -------- + >>> arr = ndtest((2, 3)) + >>> arr.to_stata('test.dta') # doctest: +SKIP + """ + self.to_frame().to_stata(filepath_or_buffer, **kwargs) + @deprecate_kwarg('sheet_name', 'sheet') def to_excel(self, filepath=None, sheet=None, position='A1', overwrite_file=False, clear_sheet=False, header=True, transpose=False, wide=True, value_name='value', engine=None, *args, **kwargs): diff --git a/larray/inout/stata.py b/larray/inout/stata.py new file mode 100644 index 000000000..9f79cfc8c --- /dev/null +++ b/larray/inout/stata.py @@ -0,0 +1,33 @@ +from __future__ import absolute_import, print_function + +import pandas as pd + +from larray.inout.pandas import from_frame + +__all__ = ['read_stata'] + + +def read_stata(filepath_or_buffer, index_col=None, sort_rows=False, sort_columns=False, + **kwargs): + """ + Reads Stata .dta file and returns an LArray with the contents + + Parameters + ---------- + filepath_or_buffer : str or file-like object + Path to .dta file or a file handle. + index_col : str or None, optional + Name of column to set as index. Defaults to None. + sort_rows : bool, optional + Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). + This only makes sense in combination with index_col. Defaults to False. + sort_columns : bool, optional + Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting). + Defaults to False. 
+ + Returns + ------- + LArray + """ + df = pd.read_stata(filepath_or_buffer, index_col=index_col, **kwargs) + return from_frame(df, sort_rows=sort_rows, sort_columns=sort_columns) From 6b7d6c695f4947bc78d174d0126844319081ff3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 5 Nov 2018 15:25:42 +0100 Subject: [PATCH 31/44] implemented LArray.dump(light=True) --- larray/core/array.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 7f1a874ca..b2bfaa3de 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -2432,7 +2432,7 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_nam # returns next line (labels of N-1 first axes + data) yield list(tick) + dataline.tolist() - def dump(self, header=True, wide=True, value_name='value'): + def dump(self, header=True, wide=True, value_name='value', light=False): """Dump array as a 2D nested list Parameters @@ -2446,6 +2446,9 @@ def dump(self, header=True, wide=True, value_name='value'): value_name : str, optional Name of the column containing the values (last column) when `wide=False` (see above). Not used if header=False. Defaults to 'value'. + light : bool, optional + Whether or not to hide repeated labels. In other words, only show a label if it is different from the + previous one. Defaults to False. Returns ------- @@ -2455,7 +2458,7 @@ def dump(self, header=True, wide=True, value_name='value'): # flatten all dimensions except the last one return self.data.reshape(-1, self.shape[-1]).tolist() else: - return list(self.as_table(wide=wide, value_name=value_name)) + return list(self.as_table(wide=wide, value_name=value_name, light=light)) # XXX: should filter(geo=['W']) return a view by default? 
(collapse=True) # I think it would be dangerous to make it the default From 44ae2a215aaae6ed37a17a92b5ef4253d3243003 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 17 Oct 2018 10:02:14 +0200 Subject: [PATCH 32/44] implemented LArray.roll (needs changelog and possibly invert n) --- larray/core/array.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/larray/core/array.py b/larray/core/array.py index b2bfaa3de..0ff2fdfbd 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -6797,6 +6797,53 @@ def shift(self, axis, n=1): else: return self[:] + def roll(self, axis=None, n=1): + r"""Rolls the cells of the array n-times to the right along axis + + Parameters + ---------- + axis : int, str or Axis, optional + Axis along which to roll. Defaults to None (all axes). + n : int or LArray, optional + Number of positions to roll. Defaults to 1. Use a negative integers to roll left. + If n is an LArray the number of positions rolled can vary along the axes of n. 
+ + Returns + ------- + LArray + + Examples + -------- + >>> arr = ndtest('sex=M,F;time=2010..2012') + >>> arr + sex\time 2010 2011 2012 + M 0 1 2 + F 3 4 5 + >>> arr.roll('time') + sex\time 2010 2011 2012 + M 2 0 1 + F 5 3 4 + >>> n = sequence(arr.sex, initial=1) + >>> n + sex M F + 1 2 + >>> arr.roll('time', n) + sex\time 2010 2011 2012 + M 2 0 1 + F 4 5 3 + """ + if isinstance(n, (int, np.integer)): + axis_idx = None if axis is None else self.axes.index(axis) + return LArray(np.roll(self.data, n, axis=axis_idx), self.axes) + else: + if not isinstance(n, LArray): + raise TypeError("n should either be an integer or an LArray") + if axis is None: + raise TypeError("axis may not be None if n is an LArray") + axis = self.axes[axis] + seq = sequence(axis) + return self[axis.i[(seq - n) % len(axis)]] + # TODO: add support for groups as axis (like aggregates) # eg a.diff(x.year[2018:]) instead of a[2018:].diff(x.year) def diff(self, axis=-1, d=1, n=1, label='upper'): From 1668e92f62a544c832d9ddbe55b476159ffde085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 21 Sep 2018 16:23:42 +0200 Subject: [PATCH 33/44] implemented AxisCollection.iter_labels --- larray/core/axis.py | 70 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/larray/core/axis.py b/larray/core/axis.py index 7ce0ae1d4..7192a5454 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -16,7 +16,7 @@ _range_to_slice, _seq_group_to_name, _translate_group_key_hdf, remove_nested_groups) from larray.util.oset import * from larray.util.misc import (basestring, PY2, unicode, long, duplicates, array_lookup2, ReprString, index_by_id, - renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice, unique_multi) + renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice, unique_multi, Product) class Axis(ABCAxis): @@ -1454,6 +1454,74 @@ def __dir__(self): def __iter__(self): return iter(self._list) + # TODO: move 
a few doctests to unit tests + def iter_labels(self, axes=None, ascending=True): + r"""Returns a view of the axes labels. + + Parameters + ---------- + axes : int, str or Axis or tuple of them, optional + Axis or axes along which to iterate and in which order. Defaults to None (all axes in the order they are + in the collection). + ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + + Returns + ------- + Sequence + An object you can iterate (loop) on and index by position. The precise type of which is considered an + implementation detail and should not be relied on. + + Examples + -------- + + >>> from larray import ndtest + >>> axes = ndtest((2, 2)).axes + >>> axes + AxisCollection([ + Axis(['a0', 'a1'], 'a'), + Axis(['b0', 'b1'], 'b') + ]) + >>> axes.iter_labels()[0] + (a.i[0], b.i[0]) + >>> for index in axes.iter_labels(): + ... print(index) + (a.i[0], b.i[0]) + (a.i[0], b.i[1]) + (a.i[1], b.i[0]) + (a.i[1], b.i[1]) + >>> axes.iter_labels(ascending=False)[0] + (a.i[1], b.i[1]) + >>> for index in axes.iter_labels(ascending=False): + ... print(index) + (a.i[1], b.i[1]) + (a.i[1], b.i[0]) + (a.i[0], b.i[1]) + (a.i[0], b.i[0]) + >>> axes.iter_labels(('b', 'a'))[0] + (b.i[0], a.i[0]) + >>> for index in axes.iter_labels(('b', 'a')): + ... print(index) + (b.i[0], a.i[0]) + (b.i[0], a.i[1]) + (b.i[1], a.i[0]) + (b.i[1], a.i[1]) + >>> axes.iter_labels('b')[0] + (b.i[0],) + >>> for index in axes.iter_labels('b'): + ... 
print(index) + (b.i[0],) + (b.i[1],) + """ + axes = self if axes is None else self[axes] + if not isinstance(axes, AxisCollection): + axes = (axes,) + # we need .i because Product uses len and [] on axes and not iter; and [] creates LGroup and not IGroup + p = Product([axis.i for axis in axes]) + if not ascending: + p = p[::-1] + return p + def __getattr__(self, key): try: return self._map[key] From 20d1b70cdf3b43c52a4fe22ea22a7be8d839c78c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 28 Nov 2018 17:54:40 +0100 Subject: [PATCH 34/44] implemented LArray.keys, LArray.values and LArray.items --- doc/source/api.rst | 13 ++ doc/source/changes/version_0_30.rst.inc | 63 +++++- larray/core/array.py | 278 +++++++++++++++++++++++- 3 files changed, 352 insertions(+), 2 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 44512920a..0fe2384ff 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -230,6 +230,7 @@ LArray * :ref:`la_sorting` * :ref:`la_reshaping` * :ref:`la_testing` +* :ref:`la_iter` * :ref:`la_op` * :ref:`la_misc` * :ref:`la_to_pandas` @@ -422,6 +423,18 @@ Testing/Searching LArray.labelofmax LArray.indexofmax +.. _la_iter: + +Iterating +--------- + +.. autosummary:: + :toctree: _generated/ + + LArray.keys + LArray.values + LArray.items + .. _la_op: Operators diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc index e550fba5a..22d2c26dc 100644 --- a/doc/source/changes/version_0_30.rst.inc +++ b/doc/source/changes/version_0_30.rst.inc @@ -134,8 +134,69 @@ New features a_b a0_b1 a1_b2 1 5 -* added a feature (see the :ref:`miscellaneous section ` for details). +* implemented :py:obj:`LArray.keys()` :py:obj:`LArray.values()` and :py:obj:`LArray.items()` + methods to iterate (loop) on an array labels (keys), values or (key, value) pairs. 
+ >>> arr = ndtest((2, 2)) + >>> arr + a\b b0 b1 + a0 0 1 + a1 2 3 + + By default they iterates on all axes, in the order they are in the array. + + >>> for value in arr.values(): + ... print(value) + 0 + 1 + 2 + 3 + >>> for key in arr.keys(): + ... print(key) + (a.i[0], b.i[0]) + (a.i[0], b.i[1]) + (a.i[1], b.i[0]) + (a.i[1], b.i[1]) + >>> for key, value in arr.items(): + ... print(key, "->", value) + (a.i[0], b.i[0]) -> 0 + (a.i[0], b.i[1]) -> 1 + (a.i[1], b.i[0]) -> 2 + (a.i[1], b.i[1]) -> 3 + + You can iterate in reverse order. + + >>> for value in arr.values(ascending=False): + ... print(value) + 3 + 2 + 1 + 0 + + or specify another axis order: + + >>> for value in arr.values(('b', 'a')): + ... print(value) + 0 + 2 + 1 + 3 + + You can also iterate on less axes than the array has. + + >>> # iterate on the "b" axis, that is, return each label along the "b" axis in turn + >>> for key in arr.keys('b'): + ... print(key) + (b.i[0],) + (b.i[1],) + >>> # iterate on the array values along the "b" axis. + >>> # That is, for each label along the "b" axis, return the corresponding (sub)array + >>> for value in arr.values('b'): + ... print(value) + a a0 a1 + 0 2 + a a0 a1 + 1 3 * implemented :py:obj:`Axis.apply()` method to transform an axis labels by a function and return a new Axis. 
diff --git a/larray/core/array.py b/larray/core/array.py index 0ff2fdfbd..ab91c4685 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -64,7 +64,7 @@ from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis from larray.util.misc import (table2str, size2str, basestring, izip, rproduct, ReprString, duplicates, float_error_handler_factory, _isnoneslice, light_product, unique_list, common_type, - renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute) + renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute, unique_multi, SequenceZip) def all(values, axis=None): @@ -3100,6 +3100,282 @@ def indicesofsorted(self, axis=None, ascending=True, kind='quicksort'): posargsort = renamed_to(indicesofsorted, 'posargsort') + # TODO: move some doctests to unit tests + # TODO: implement keys_by + # TODO: implement expand=True + def keys(self, axes=None, ascending=True): + r"""Returns a view on the array labels along axes. + + This is an object you can iterate (loop) on or index to get to the Nth label along axes. + + Parameters + ---------- + axes : int, str or Axis or tuple of them, optional + Axis or axes along which to iterate and in which order. Defaults to None (all axes in the order they are + in the array). + ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + + Returns + ------- + Sequence + An object you can iterate (loop) on and index by position to get the Nth label along axes. + + Examples + -------- + >>> arr = ndtest((2, 2)) + >>> arr + a\b b0 b1 + a0 0 1 + a1 2 3 + + By default it iterates on all axes, in the order they are in the array. + + >>> arr.keys()[0] + (a.i[0], b.i[0]) + >>> for key in arr.keys(): + ... print(key) + (a.i[0], b.i[0]) + (a.i[0], b.i[1]) + (a.i[1], b.i[0]) + (a.i[1], b.i[1]) + >>> arr.keys(ascending=False)[0] + (a.i[1], b.i[1]) + >>> for key in arr.keys(ascending=False): + ... 
print(key) + (a.i[1], b.i[1]) + (a.i[1], b.i[0]) + (a.i[0], b.i[1]) + (a.i[0], b.i[0]) + + but you can specify another axis order: + + >>> arr.keys(('b', 'a'))[0] + (b.i[0], a.i[0]) + >>> for key in arr.keys(('b', 'a')): + ... print(key) + (b.i[0], a.i[0]) + (b.i[0], a.i[1]) + (b.i[1], a.i[0]) + (b.i[1], a.i[1]) + + One can specify less axes than the array has: + + >>> arr.keys('b')[0] + (b.i[0],) + >>> # iterate on the "b" axis, that is return each label along the "b" axis + ... for key in arr.keys('b'): + ... print(key) + (b.i[0],) + (b.i[1],) + """ + return self.axes.iter_labels(axes, ascending=ascending) + + # TODO: move many doctests to unit tests + # TODO: implement values_by + def values(self, axes=None, ascending=True, expand=False): + r"""Returns a view on the values of the array along axes. + + Parameters + ---------- + axes : int, str or Axis or tuple of them, optional + Axis or axes along which to iterate and in which order. Defaults to None (all axes in the order they are + in the array). + ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + expand : bool, optional + Whether or not to expand array using axes. This allows one to iterate on axes which do not exist in + the array, which is useful when iterating on several arrays with different axes. Defaults to False. + + Returns + ------- + Sequence + An object you can iterate (loop) on and index by position. The precise type of which is considered an + implementation detail and should not be relied on. + + Examples + -------- + >>> arr = ndtest((2, 2)) + >>> arr + a\b b0 b1 + a0 0 1 + a1 2 3 + + By default it iterates on all axes, in the order they are in the array. + + >>> for value in arr.values(): + ... 
print(value) + 0 + 1 + 2 + 3 + >>> arr.values()[0] + 0 + >>> arr.values(ascending=False)[0] + 3 + >>> arr.values()[-1] + 3 + >>> arr.values(('b', 'a'))[1] + 2 + >>> arr.values('b')[0] + a a0 a1 + 0 2 + >>> arr.values('b', ascending=False)[0] + a a0 a1 + 1 3 + >>> arr[arr.b.i[-1]] + a a0 a1 + 1 3 + >>> arr['b.i[-1]'] + a a0 a1 + 1 3 + >>> arr.values('b')[-1] + a a0 a1 + 1 3 + >>> for value in arr.values(ascending=False): + ... print(value) + 3 + 2 + 1 + 0 + + but you can specify another axis order: + + >>> for value in arr.values(('b', 'a')): + ... print(value) + 0 + 2 + 1 + 3 + + When you specify less axes than the array has, you get arrays back: + + >>> # iterate on the "b" axis, that is return the (sub)array for each label along the "b" axis + ... for value in arr.values('b'): + ... print(value) + a a0 a1 + 0 2 + a a0 a1 + 1 3 + >>> # iterate on the "c" axis, which does not exist in arr, that is return arr for each label along the "c" axis + ... for value in arr.values('c=c0,c1', expand=True): + ... print(value) + a\b b0 b1 + a0 0 1 + a1 2 3 + a\b b0 b1 + a0 0 1 + a1 2 3 + >>> # iterate on the "b" axis, that is return the (sub)array for each label along the "b" axis + ... for value in arr.values('b', ascending=False): + ... 
print(value) + a a0 a1 + 1 3 + a a0 a1 + 0 2 + """ + if axes is None: + combined = np.ravel(self.data) + return combined if ascending else combined[::-1] + + if not isinstance(axes, (tuple, AxisCollection)): + axes = (axes,) + + def get_axis(a): + if isinstance(a, basestring): + return Axis(a) if '=' in a else self.axes[a] + elif isinstance(a, int): + return self.axes[a] + else: + assert isinstance(a, Axis) + return a + axes = [get_axis(a) for a in axes] + array = self.expand(axes, readonly=True) if expand else self + axes = array.axes[axes] + # move axes in front + transposed = array.transpose(axes) + # combine axes if necessary + combined = transposed.combine_axes(axes, wildcard=True) if len(axes) > 1 else transposed + # trailing .i is to support the case where axis < self.axes (ie the elements of the result are arrays) + return combined.i if ascending else combined.i[::-1].i + + # TODO: move some doctests to unit tests + # TODO: we currently return a tuple of groups even for 1D arrays, which can be both a bad or a good thing. + # if we returned an NDGroup in all cases, it would solve the problem + # TODO: implement expand=True + def items(self, axes=None, ascending=True): + r"""Returns a (label, value) view of the array along axes. + + This is an object you can iterate (loop) on or index to get to (label, value) couples along axes. + + Parameters + ---------- + axes : int, str or Axis or tuple of them, optional + Axis or axes along which to iterate and in which order. Defaults to None (all axes in the order they are + in the array). + ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + + Returns + ------- + Sequence + An object you can iterate (loop) on and index by position to get the Nth (label, value) couple along axes. + + Examples + -------- + >>> arr = ndtest((2, 2)) + >>> arr + a\b b0 b1 + a0 0 1 + a1 2 3 + + By default it iterates on all axes, in the order they are in the array. 
+ + >>> arr.items()[0] + ((a.i[0], b.i[0]), 0) + >>> for key, value in arr.items(): + ... print(key, "->", value) + (a.i[0], b.i[0]) -> 0 + (a.i[0], b.i[1]) -> 1 + (a.i[1], b.i[0]) -> 2 + (a.i[1], b.i[1]) -> 3 + >>> arr.items(ascending=False)[0] + ((a.i[1], b.i[1]), 3) + >>> for key, value in arr.items(ascending=False): + ... print(key, "->", value) + (a.i[1], b.i[1]) -> 3 + (a.i[1], b.i[0]) -> 2 + (a.i[0], b.i[1]) -> 1 + (a.i[0], b.i[0]) -> 0 + + but you can specify another axis order: + + >>> arr.items(('b', 'a'))[0] + ((b.i[0], a.i[0]), 0) + >>> for key, value in arr.items(('b', 'a')): + ... print(key, "->", value) + (b.i[0], a.i[0]) -> 0 + (b.i[0], a.i[1]) -> 2 + (b.i[1], a.i[0]) -> 1 + (b.i[1], a.i[1]) -> 3 + + When you specify less axes than the array has, you get arrays back: + + >>> arr.items('b')[0] + ((b.i[0],), a a0 a1 + 0 2) + >>> # iterate on the "b" axis, that is return the (sub)array for each label along the "b" axis + ... for key, value in arr.items('b'): + ... print(key, value, sep="\n") + (b.i[0],) + a a0 a1 + 0 2 + (b.i[1],) + a a0 a1 + 1 3 + """ + return SequenceZip((self.keys(axes, ascending=ascending), self.values(axes, ascending=ascending))) + def copy(self): """Returns a copy of the array. 
""" From 35dadc43e50cdb915a1089f4247f4501433af05f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 27 Nov 2018 10:45:31 +0100 Subject: [PATCH 35/44] implemented LArray.flat --- larray/core/array.py | 49 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/larray/core/array.py b/larray/core/array.py index ab91c4685..559761cbd 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -386,6 +386,26 @@ def __setitem__(self, key, value): self.array.__setitem__(self._prepare_key(key, wildcard=True), value, translate_key=False) +class LArrayFlatIndexer(object): + __slots__ = ('array',) + + def __init__(self, array): + self.array = array + + def __getitem__(self, flat_key, sep='_'): + axes = self.array.axes + key = np.unravel_index(flat_key, axes.shape) + la_key = axes._adv_keys_to_combined_axis_la_keys(key, sep=sep) + return self.array.__getitem__(la_key, translate_key=False) + + def __setitem__(self, flat_key, value): + # np.ndarray.flat is a flatiter object but it is indexable despite the name + self.array.data.flat[flat_key] = value + + def __len__(self): + return self.array.size + + # TODO: rename to LArrayIndexPointsIndexer or something like that class LArrayPositionalPointsIndexer(object): __slots__ = ('array',) @@ -3376,6 +3396,35 @@ def items(self, axes=None, ascending=True): """ return SequenceZip((self.keys(axes, ascending=ascending), self.values(axes, ascending=ascending))) + # XXX: rename to iflat instead? 
+ @lazy_attribute + def flat(self): + r"""Access the array by index as if it was flattened (all its axes were combined) + + Examples + -------- + >>> arr = ndtest((2, 3)) * 10 + >>> arr + a\b b0 b1 b2 + a0 0 10 20 + a1 30 40 50 + + To select the first, second, fourth and fifth values across all axes: + + >>> arr.flat[[0, 1, 3, 4]] + a_b a0_b0 a0_b1 a1_b0 a1_b1 + 0 10 30 40 + + Set the first and sixth values to 42 + + >>> arr.flat[[0, 5]] = 42 + >>> arr + a\b b0 b1 b2 + a0 42 10 20 + a1 30 40 42 + """ + return LArrayFlatIndexer(self) + def copy(self): """Returns a copy of the array. """ From 0c9dc01118467b1de389f5f9d1cb3389345526b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 27 Nov 2018 10:45:46 +0100 Subject: [PATCH 36/44] implemented LArray.unique --- doc/source/api.rst | 1 + doc/source/changes/version_0_30.rst.inc | 34 +++++++++ larray/core/array.py | 94 +++++++++++++++++++++++++ larray/tests/test_array.py | 16 +++++ 4 files changed, 145 insertions(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index 0fe2384ff..fbb7ba153 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -456,6 +456,7 @@ Miscellaneous LArray.clip LArray.shift LArray.diff + LArray.unique LArray.to_clipboard .. _la_to_pandas: diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc index 22d2c26dc..a1b00de1b 100644 --- a/doc/source/changes/version_0_30.rst.inc +++ b/doc/source/changes/version_0_30.rst.inc @@ -134,6 +134,40 @@ New features a_b a0_b1 a1_b2 1 5 +* implemented :py:obj:`LArray.unique()` method to compute unique values for an array, optionally + along axes. + + >>> arr = LArray([[0, 2, 0, 0], + ... 
[1, 1, 1, 0]], 'a=a0,a1;b=b0..b3') + >>> arr + a\b b0 b1 b2 b3 + a0 0 2 0 0 + a1 1 1 1 0 + + By default unique() returns the first occurrence of each unique value in the order it appears: + + >>> arr.unique() + a_b a0_b0 a0_b1 a1_b0 + 0 2 1 + + To sort the unique values, use the sort argument: + + >>> arr.unique(sort=True) + a_b a0_b0 a1_b0 a0_b1 + 0 1 2 + + One can also compute unique sub-arrays (i.e. combination of values) along axes. In our example the a0=0, a1=1 + combination appears twice along the 'b' axis, so 'b2' is not returned: + + >>> arr.unique('b') + a\b b0 b1 b3 + a0 0 2 0 + a1 1 1 0 + >>> arr.unique('b', sort=True) + a\b b3 b0 b1 + a0 0 0 2 + a1 0 1 1 + * implemented :py:obj:`LArray.keys()` :py:obj:`LArray.values()` and :py:obj:`LArray.items()` methods to iterate (loop) on an array labels (keys), values or (key, value) pairs. diff --git a/larray/core/array.py b/larray/core/array.py index 559761cbd..c9f293cf3 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -3430,6 +3430,100 @@ def copy(self): """ return LArray(self.data.copy(), axes=self.axes[:], meta=self.meta) + # XXX: we might want to implement this using .groupby().first() + def unique(self, axes=None, sort=False, sep='_'): + r"""Returns unique values (optionally along axes) + + Parameters + ---------- + axes : axis reference (int, str, Axis) or sequence of them, optional + Axis or axes along which to compute unique values. Defaults to None (all axes). + sort : bool, optional + Whether or not to sort unique values. Defaults to False. Sorting is not implemented yet for unique() along + multiple axes. + sep : str, optional + Separator when several labels need to be combined. Defaults to '_'. + + Returns + ------- + LArray + array with unique values + + Examples + -------- + >>> arr = LArray([[0, 2, 0, 0], + ... 
[1, 1, 1, 0]], 'a=a0,a1;b=b0..b3') + >>> arr + a\b b0 b1 b2 b3 + a0 0 2 0 0 + a1 1 1 1 0 + + By default unique() returns the first occurrence of each unique value in the order it appears: + + >>> arr.unique() + a_b a0_b0 a0_b1 a1_b0 + 0 2 1 + + To sort the unique values, use the sort argument: + + >>> arr.unique(sort=True) + a_b a0_b0 a1_b0 a0_b1 + 0 1 2 + + One can also compute unique sub-arrays (i.e. combination of values) along axes. In our example the a0=0, a1=1 + combination appears twice along the 'b' axis, so 'b2' is not returned: + + >>> arr.unique('b') + a\b b0 b1 b3 + a0 0 2 0 + a1 1 1 0 + >>> arr.unique('b', sort=True) + a\b b3 b0 b1 + a0 0 0 2 + a1 0 1 1 + """ + if axes is not None: + axes = self.axes[axes] + + assert axes is None or isinstance(axes, (Axis, AxisCollection)) + + if not isinstance(axes, AxisCollection): + axis_idx = self.axes.index(axes) if axes is not None else None + # axis needs np >= 1.13 + _, unq_index = np.unique(self, axis=axis_idx, return_index=True) + if not sort: + unq_index = np.sort(unq_index) + if axes is None: + return self.flat.__getitem__(unq_index, sep=sep) + else: + return self[axes.i[unq_index]] + else: + if sort: + raise NotImplementedError('sort=True is not implemented for unique along multiple axes') + unq_list = [] + seen = set() + list_append = unq_list.append + seen_add = seen.add + sep_join = sep.join + axis_name = sep_join(a.name for a in axes) + first_axis_idx = self.axes.index(axes[0]) + # XXX: use combine_axes(axes).items() instead? 
+ for labels, value in self.items(axes): + hashable_value = value.data.tobytes() if isinstance(value, LArray) else value + if hashable_value not in seen: + list_append((sep_join(str(l) for l in labels), value)) + seen_add(hashable_value) + res_arr = stack(unq_list, axis_name) + # transpose the combined axis at the position where the first of the combined axes was + # TODO: use res_arr.transpose(res_arr.axes.move_axis(-1, first_axis_idx)) once #564 is implemented: + # https://github.com/larray-project/larray/issues/564 + # stack adds the stacked axes at the end + combined_axis = res_arr.axes[-1] + assert combined_axis.name == axis_name + new_axes_order = res_arr.axes - combined_axis + new_axes_order.insert(first_axis_idx, combined_axis) + return res_arr.transpose(new_axes_order) + @property def info(self): """Describes a LArray (metadata + shape and labels for each axis). diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index c54497c09..6026cffba 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -2849,6 +2849,22 @@ def test_shift_axis(small_array): l2.axes.lipro.labels = lipro.labels[1:] +def test_unique(): + arr = LArray([[[0, 2, 0, 0], + [1, 1, 1, 0]], + [[0, 2, 0, 0], + [2, 1, 2, 0]]], 'a=a0,a1;b=b0,b1;c=c0..c3') + assert_array_equal(arr.unique('a'), arr) + assert_array_equal(arr.unique('b'), arr) + assert_array_equal(arr.unique('c'), arr['c0,c1,c3']) + expected = from_string("""\ +a_b\\c c0 c1 c2 c3 +a0_b0 0 2 0 0 +a0_b1 1 1 1 0 +a1_b1 2 1 2 0""") + assert_array_equal(arr.unique(('a', 'b')), expected) + + def test_extend(small_array): sex, lipro = small_array.axes From 53d1581c7720b80d3efde9cde42ad5783d5e0922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Tue, 25 Sep 2018 09:20:14 +0200 Subject: [PATCH 37/44] WIP: generalized stack to more than one dimension (needs tests & finish the changelog) works for both stack([(ndkey, value), ...], axis=axes) and stack({ndkey: value}, several_axes) 
TODO: we might want to implement nested dicts before advertising this! --- doc/source/changes/version_0_30.rst.inc | 19 +++ larray/core/array.py | 200 +++++++++++++++++++----- larray/core/axis.py | 64 +++++++- 3 files changed, 244 insertions(+), 39 deletions(-) diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc index a1b00de1b..e6477ee95 100644 --- a/doc/source/changes/version_0_30.rst.inc +++ b/doc/source/changes/version_0_30.rst.inc @@ -287,6 +287,25 @@ Miscellaneous improvements A0 0 1 A1 2 3 +* py:obj:`stack()` can now stack along several axes at once. + + >>> gender = Axis('gender=M,F') + >>> country = Axis('country=BE,FR,DE') + >>> stack({('BE', 'M'): 0, + ... ('FR', 'F'): 2, + ... ('BE', 'F'): 2, + ... ('FR', 'M'): 2, + ... ('DE', 'M'): 2, + ... ('DE', 'F'): 2}, + ... (gender, country)) + FIXME: this is not correct + sex nat\test T1 T2 + M BE 1.0 0.0 + M FO 0.0 1.0 + F BE 1.0 0.0 + F FO 0.0 1.0 + + * added option ``exact`` to ``join`` argument of :py:obj:`Axis.align()` and :py:obj:`LArray.align()` methods. Instead of aligning, passing ``join='exact'`` to the ``align`` method will raise an error when axes are not equal. Closes :issue:`338`. diff --git a/larray/core/array.py b/larray/core/array.py index c9f293cf3..02253d274 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -8,8 +8,9 @@ # ? implement multi group in one axis getitem: lipro['P01,P02;P05'] <=> (lipro['P01,P02'], lipro['P05']) # * we need an API to get to the "next" label. Sometimes, we want to use label+1, but that is problematic when labels -# are not numeric, or have not a step of 1. X.agegroup[X.agegroup.after(25):] -# X.agegroup[X.agegroup[25].next():] +# are not numeric, or have not a step of 1. +# X.agegroup[X.agegroup.after(25):] +# X.agegroup[X.agegroup[25].next():] # * implement keepaxes=True for _group_aggregate instead of/in addition to group tuples @@ -25,10 +26,7 @@ # * test structured arrays -# ? 
move "utils" to its own project (so that it is not duplicated between larray and liam2) -# OR -# include utils only in larray project and make larray a dependency of liam2 -# (and potentially rename it to reflect the broader scope) +# * use larray "utils" in LIAM2 (to avoid duplicated code) from collections import Iterable, Sequence, OrderedDict from itertools import product, chain, groupby, islice @@ -8656,7 +8654,69 @@ def eye(rows, columns=None, k=0, title=None, dtype=None, meta=None): # ('FR', 'M'): 2, ('FR', 'F'): 3, # ('DE', 'M'): 4, ('DE', 'F'): 5}) +# for 2D, I think the best compromise is the nested dict (especially for python 3.7+): +# stack({'BE': {'M': 0, 'F': 1}, +# 'FR': {'M': 2, 'F': 3}, +# 'DE': {'M': 4, 'F': 5}}, axes=('nationality', 'sex')) + +# we could make this valid too (combine pos and labels) but I don't think it worth it unless it comes +# naturally from the implementation: + +# stack({'BE': {'M,F': [0, 1]}, +# 'FR': {'M,F': [2, 3]}, +# 'DE': {'M,F': [4, 5]}}, axes=('nationality', 'sex')) + +# It looks especially nice if the labels have been extracted to variables: + +# BE, FR, DE = nat['BE,FR,DE'] +# M, F = sex['M,F'] + +# stack({BE: {M: 0, F: 1}, +# FR: {M: 2, F: 3}, +# DE: {M: 4, F: 5}}) + +# for 3D: + +# stack({'a0': {'b0': {'c0': 0, 'c1': 1}, +# 'b1': {'c0': 2, 'c1': 3}, +# 'b2': {'c0': 4, 'c1': 5}}, +# 'a1': {'b0': {'c0': 6, 'c1': 7}, +# 'b1': {'c0': 8, 'c1': 9}, +# 'b2': {'c0': 10, 'c1': 11}}}, +# axes=('a', 'b', 'c')) + +# a0, a1 = a['a0,a1'] +# b0, b1, b2 = b['b0,b1,b2'] +# c0, c1 = c['c0,c1'] + +# stack({a0: {b0: {c0: 0, c1: 1}, +# b1: {c0: 2, c1: 3}, +# b2: {c0: 4, c1: 5}}, +# a1: {b0: {c0: 6, c1: 7}, +# b1: {c0: 8, c1: 9}, +# b2: {c0: 10, c1: 11}}}, +# axes=(a, b, c)) + +# if we implement: +# arr[key] = {'a0': 0, 'a1': 1} +# where key must not be related to the "a" axis +# if would make it relatively easy to implement the nested dict syntax I think: +# first do a pass at the structure to get axes (if not provided) then: +# for 
k, v in d.items(): +# arr[k] = v +# but that syntax could be annoying if we want to have an array of dicts + +# alternatives: + +# arr['a0'] = 0; arr['a1'] = 1 # <-- this already works +# arr['a0,a1'] = [0, 1] # <-- unsure if this works, but we should make it work (it is annoying if we +# # have an array of lists +# arr[:] = {'a0': 0, 'a1': 1} +# arr[:] = stack({'a0': 0, 'a1': 1}) # <-- not equivalent if a has more labels + + +# TODO: rename axis to axes (with deprecation) def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs): r""" Combines several arrays or sessions along an axis. @@ -8670,8 +8730,8 @@ def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs) Stacking sessions will return a new session containing the arrays of all sessions stacked together. An array missing in a session will be replaced by NaN. - axis : str or Axis or Group, optional - Axis to create. If None, defaults to a range() axis. + axis : str or Axis or Group or tuple/AxisCollection of Axis, optional + Axes to create. If None, defaults to a range() axis. title : str, optional Deprecated. See 'meta' below. meta : list of pairs or dict or OrderedDict or Metadata, optional @@ -8704,6 +8764,8 @@ def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs) sex\nat BE FO M 1.0 0.0 F 1.0 0.0 + + >>> # TODO: move this to a unit test >>> all_nat = Axis('nat=BE,DE,FR,NL,UK') >>> stack({'BE': arr1, 'DE': arr2}, all_nat[:'DE']) sex\nat BE DE @@ -8756,6 +8818,20 @@ def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs) M 1.0 0.0 F 1.0 0.0 + Stack can also stack along several axes + + >>> test = Axis('test=T1,T2') + >>> stack({('BE', 'T1'): arr1, + ... ('BE', 'T2'): arr2, + ... ('FO', 'T1'): arr2, + ... ('FO', 'T2'): arr1}, + ... (nat, test)) + sex nat\test T1 T2 + M BE 1.0 0.0 + M FO 0.0 1.0 + F BE 1.0 0.0 + F FO 0.0 1.0 + To stack sessions, let us first create two test sessions. 
For example suppose we have a session storing the results of a baseline simulation: @@ -8796,67 +8872,115 @@ def stack(elements=None, axis=None, title=None, meta=None, dtype=None, **kwargs) elif kwargs: raise TypeError("stack() accept either keyword arguments OR a collection of elements, not both") - if isinstance(axis, Axis) and all(isinstance(e, tuple) for e in elements): - assert all(len(e) == 2 for e in elements) - elements = {k: v for k, v in elements} - if isinstance(elements, LArray): if axis is None: axis = -1 axis = elements.axes[axis] - values = [elements[k] for k in axis] + items = elements.items(axis) elif isinstance(elements, dict): + axis_tuple = isinstance(axis, tuple) and all(isinstance(a, Axis) for a in axis) + axis_seq = isinstance(axis, AxisCollection) or axis_tuple # TODO: support having no Axis object for Python3.7 (without error or warning) # XXX: we probably want to support this with a warning on Python < 3.7 - assert isinstance(axis, Axis) - values = [elements[v] for v in axis.labels] + assert isinstance(axis, Axis) or axis_seq + if not isinstance(axis, AxisCollection): + axis = AxisCollection(axis) + + # this assumes we support non complete axes + # items = [(axis.to_igroup(k), v) for k, v in elements.items()] + + # translate elements keys to a group or tuple of groups so that they are compatible with + # what iter_labels gives us + # FIXME: we must also reorder translated keys otherwise if axis order is different it does not match + # e.g. + # >>> gender = Axis('gender=M,F') + # >>> country = Axis('country=BE,FR,DE') + # >>> stack({('BE', 'M'): 0, + # ... ('FR', 'F'): 2, + # ... ('BE', 'F'): 2, + # ... ('FR', 'M'): 2, + # ... ('DE', 'M'): 2, + # ... ('DE', 'F'): 2}, + # ... 
(gender, country)) + # solution: in to_igroup, go via dict then tuple of igroups or slice(None) + elements = {axis.to_igroup(k): v for k, v in elements.items()} + items = [(k, elements[k]) for k in axis.iter_labels()] elif isinstance(elements, Iterable): if not isinstance(elements, Sequence): elements = list(elements) if all(isinstance(e, tuple) for e in elements): assert all(len(e) == 2 for e in elements) - keys = [k for k, v in elements] - values = [v for k, v in elements] - assert all(np.isscalar(k) for k in keys) - # this case should already be handled - assert not isinstance(axis, Axis) - # axis should be None or str - axis = Axis(keys, axis) + items = elements + if axis is None or isinstance(axis, basestring): + keys = [k for k, v in elements] + # assert that all keys are indexers + assert all(np.isscalar(k) or isinstance(k, (Group, tuple)) for k in keys) + axis = Axis(keys, axis) else: - values = elements if axis is None or isinstance(axis, basestring): axis = Axis(len(elements), axis) else: assert len(axis) == len(elements) + items = list(zip(axis, elements)) else: raise TypeError('unsupported type for arrays: %s' % type(elements).__name__) - if any(isinstance(v, Session) for v in values): - sessions = values - if not all(isinstance(s, Session) for s in sessions): + if any(isinstance(v, Session) for k, v in items): + if not all(isinstance(v, Session) for k, v in items): raise TypeError("stack() only supports stacking Session with other Session objects") - all_keys = unique_multi(s.keys() for s in sessions) - res = [] - for name in all_keys: + array_names = unique_multi(sess.keys() for sess_name, sess in items) + + def stack_one(array_name): try: - stacked = stack([s.get(name, nan) for s in sessions], axis=axis) + return stack([(sess_name, sess.get(array_name, nan)) + for sess_name, sess in items], axis=axis) # TypeError for str arrays, ValueError for incompatible axes, ... 
except Exception: - stacked = nan - res.append((name, stacked)) - return Session(res, meta=meta) + return nan + + return Session([(name, stack_one(name)) for name in array_names], meta=meta) else: - # XXX : use concat? values = [aslarray(v) if not np.isscalar(v) else v - for v in values] - result_axes = AxisCollection.union(*[get_axes(v) for v in values]) - result_axes.append(axis) + for k, v in items] + + # we need a kludge to support stacking along an anonymous axis because AxisCollection.extend (and thus + # AxisCollection.union support for anonymous axes is kinda messy). + if isinstance(axis, Axis) and axis.name is None: + axis = axis.rename('__anonymous__') + kludge = True + else: + kludge = False + + # XXX: with the current semantics of stack, we need to compute the union of axes for values but axis + # needs to be added unconditionally. We *might* want to change the semantics to mean either stack or + # concat depending on whether or not the axis already exists. + # this would be more convenient for users I think, but would mean one class of error we cannot detect + # anymore: if a user unintentionally stacks an array with the axis already present. + # (this is very similar to the debate about combining LArray.append and LArray.extend) + all_axes = [get_axes(v) for v in values] + [axis] + result_axes = AxisCollection.union(*all_axes) + if kludge: + # TODO: use AxisCollection.rename when it will exist + result_axes = result_axes.replace(axis, axis.rename(None)) + if dtype is None: dtype = common_type(values) + + # XXX: if we want to support partial axes, we need to use full with a fillvalue + # but only if not entirely filled. How do we check that efficiently? + # *assuming* axes do not contain duplicate labels, we could check that + # len(unique(keys)) == result_axes.size but that would be expensive + # Note that we can translate the keys first then check if it fills the whole array + # as it will be faster to compare ints than strings. 
+ # a quick check would be len(items) == result_axes.size but that isn't very robust result = empty(result_axes, dtype=dtype, meta=meta) - for k, v in zip(axis, values): + + # FIXME: this is *much* faster but it only works for scalars and not for stacking arrays + # keys = tuple(zip(*[k for k, v in items])) + # result.points[keys] = values + for k, v in items: result[k] = v return result diff --git a/larray/core/axis.py b/larray/core/axis.py index 7192a5454..b04fb73af 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -1956,7 +1956,11 @@ def check_compatible(self, axes): if not local_axis.iscompatible(axis): raise ValueError("incompatible axes:\n{!r}\nvs\n{!r}".format(axis, local_axis)) - # TODO: deprecate method. union is enough + # XXX: deprecate method (functionality is duplicated in union)? + # I am not so sure anymore we need to actually deprecate the method: having both methods with the same + # semantic like we currently have is useless indeed but I think we should have both a set-like method (union) + # and the possibility to add an axis unconditionally (append or extend). That is, add an axis, even if that + # name already exists. This is especially important for anonymous axes (see my comments in stack for example) # TODO: deprecate validate argument (unused) # TODO: deprecate replace_wildcards argument (unused) def extend(self, axes, validate=True, replace_wildcards=False): @@ -2470,6 +2474,62 @@ def _translate_axis_key(self, axis_key, bool_passthrough=True): else: return self._translate_axis_key_chunk(axis_key, bool_passthrough) + def to_igroup(self, key): + """ + Transforms any key (from LArray.__get|setitem__) to a complete indices-based group key. 
+ + Parameters + ---------- + key : scalar, list/array of scalars, Group or tuple or dict of them + any key supported by LArray.__get|setitem__ + + Returns + ------- + tuple of IGroup + len(tuple) == len(key) if isinstance(key, tuple) else 1 + """ + from .array import LArray + + # convert scalar keys to 1D keys + if not isinstance(key, (tuple, dict)): + key = (key,) + + # FIXME: add support for dict key + + # always the case except if key is a dict + if isinstance(key, tuple): + key = tuple(axis_key.evaluate(self) if isinstance(axis_key, ExprNode) else axis_key + for axis_key in key) + + nonboolkey = [] + for axis_key in key: + if isinstance(axis_key, np.ndarray) and np.issubdtype(axis_key.dtype, np.bool_): + if axis_key.shape != self.shape: + raise ValueError("boolean key with a different shape ({}) than array ({})" + .format(axis_key.shape, self.shape)) + axis_key = LArray(axis_key, self) + + if isinstance(axis_key, LArray) and np.issubdtype(axis_key.dtype, np.bool_): + extra_key_axes = axis_key.axes - self + if extra_key_axes: + raise ValueError("subset key contains more axes ({}) than array ({})" + .format(axis_key.axes, self)) + nonboolkey.extend(axis_key.nonzero()) + else: + nonboolkey.append(axis_key) + key = tuple(nonboolkey) + + # drop slice(None) and Ellipsis since they are meaningless because of guess_axis. + # XXX: we might want to raise an exception when we find Ellipses or (most) slice(None) because except for + # a single slice(None) a[:], I don't think there is any point. + key = [axis_key for axis_key in key + if not _isnoneslice(axis_key) and axis_key is not Ellipsis] + + # translate all keys to IGroup + return tuple(self._translate_axis_key(axis_key) for axis_key in key) + else: + raise ValueError('dict key not supported for now') + def _translated_key(self, key): """ Transforms any key (from LArray.__get|setitem__) to a complete indices-based key. 
@@ -2487,6 +2547,8 @@ def _translated_key(self, key): This key is not yet usable as is in a numpy array as it can still contain LArray parts and the advanced key parts are not broadcasted together yet. """ + # FIXME: use to_igroup + from .array import LArray # convert scalar keys to 1D keys From 33e215c0970998aafe12eea842f864f78b56fc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 12 Dec 2018 17:28:38 +0100 Subject: [PATCH 38/44] implemented zip_array_values and zip_array_items --- larray/core/array.py | 341 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 310 insertions(+), 31 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 02253d274..5891f9129 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -28,8 +28,8 @@ # * use larray "utils" in LIAM2 (to avoid duplicated code) -from collections import Iterable, Sequence, OrderedDict -from itertools import product, chain, groupby, islice +from collections import Iterable, Sequence, OrderedDict, abc +from itertools import product, chain, groupby, islice, repeat import os import sys import functools @@ -62,8 +62,8 @@ from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis from larray.util.misc import (table2str, size2str, basestring, izip, rproduct, ReprString, duplicates, float_error_handler_factory, _isnoneslice, light_product, unique_list, common_type, - renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute, unique_multi, SequenceZip) - + renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute, unique_multi, SequenceZip, + Repeater) def all(values, axis=None): """ @@ -3191,7 +3191,7 @@ def keys(self, axes=None, ascending=True): # TODO: move many doctests to unit tests # TODO: implement values_by - def values(self, axes=None, ascending=True, expand=False): + def values(self, axes=None, ascending=True): r"""Returns a view on the values of the array along axes. 
Parameters @@ -3201,9 +3201,6 @@ def values(self, axes=None, ascending=True, expand=False): in the array). ascending : bool, optional Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. - expand : bool, optional - Whether or not to expand array using axes. This allows one to iterate on axes which do not exist in - the array, which is useful when iterating on several arrays with different axes. Defaults to False. Returns ------- @@ -3275,15 +3272,6 @@ def values(self, axes=None, ascending=True, expand=False): 0 2 a a0 a1 1 3 - >>> # iterate on the "c" axis, which does not exist in arr, that is return arr for each label along the "c" axis - ... for value in arr.values('c=c0,c1', expand=True): - ... print(value) - a\b b0 b1 - a0 0 1 - a1 2 3 - a\b b0 b1 - a0 0 1 - a1 2 3 >>> # iterate on the "b" axis, that is return the (sub)array for each label along the "b" axis ... for value in arr.values('b', ascending=False): ... print(value) @@ -3294,24 +3282,15 @@ def values(self, axes=None, ascending=True, expand=False): """ if axes is None: combined = np.ravel(self.data) + # contrary to what I thought, combined[::-1] *is* indexable return combined if ascending else combined[::-1] - if not isinstance(axes, (tuple, AxisCollection)): + if not isinstance(axes, (tuple, list, AxisCollection)): axes = (axes,) - def get_axis(a): - if isinstance(a, basestring): - return Axis(a) if '=' in a else self.axes[a] - elif isinstance(a, int): - return self.axes[a] - else: - assert isinstance(a, Axis) - return a - axes = [get_axis(a) for a in axes] - array = self.expand(axes, readonly=True) if expand else self - axes = array.axes[axes] + axes = self.axes[axes] # move axes in front - transposed = array.transpose(axes) + transposed = self.transpose(axes) # combine axes if necessary combined = transposed.combine_axes(axes, wildcard=True) if len(axes) > 1 else transposed # trailing .i is to support the case where axis < self.axes (ie the elements of the 
result are arrays) @@ -3320,7 +3299,6 @@ def get_axis(a): # TODO: move some doctests to unit tests # TODO: we currently return a tuple of groups even for 1D arrays, which can be both a bad or a good thing. # if we returned an NDGroup in all cases, it would solve the problem - # TODO: implement expand=True def items(self, axes=None, ascending=True): r"""Returns a (label, value) view of the array along axes. @@ -7646,6 +7624,305 @@ def split_axes(self, axes=None, sep='_', names=None, regex=None, sort=False, fil return array split_axis = renamed_to(split_axes, 'split_axis') + # FIXME: implement apply_by (this might be this function) and apply + def apply(self, transform, axes=None, dtype=None, ascending=True, args=(), **kwargs): + r""" + Apply func to array elements along axes. + + Parameters + ---------- + transform : function or mapping + Function or mapping to apply to elements of the array. + The axes and dtype of all results must be the same. Functions will be called with the original value + as first argument and must return a single new value. A mapping (dict) must have the values to transform + as keys and the new values as values, that is: {: , : , ...}. + axes : str, int or Axis or tuple/list/AxisCollection of the them, optional + Axis or axes along which to operate. Defaults to None (all axes). + Using the axes argument only works with a function transform. + dtype : type, optional + Output dtype. Defaults to None (inspect all output values to infer it automatically). + ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + args : tuple, optional + Extra arguments to pass to the function. Defaults to (). + **kwargs + Extra keyword arguments are passed to the function (as keyword arguments). + + Returns + ------- + LArray or scalar + Axes will be the union of those in axis and those of values returned by the function. 
+ + Notes + ----- + To apply a transformation given as an LArray (with current values as labels on one axis of + the array and desired values as the array values), you can use: ``mapping_arr[original_arr]``. + + Examples + -------- + First let us define a test array + + >>> arr = LArray([[0, 2, 1], + ... [3, 1, 5]], 'a=a0,a1;b=b0..b2') + >>> arr + a\b b0 b1 b2 + a0 0 2 1 + a1 3 1 5 + + Here is a simple function we would like to apply to each element of the array. + Note that this particular example should rather be written as: arr ** 2 + as it is both more concise and much faster. + + >>> def square(x): + ... return x ** 2 + >>> arr.apply(square) + a\b b0 b1 b2 + a0 0 4 1 + a1 9 1 25 + + Now, assuming for a moment that the values of our test array above were in fact some numeric representation of + names and we had the correspondence to the actual names stored in a dictionary: + + >>> code_to_names = {0: 'foo', 1: 'bar', 2: 'baz', + ... 3: 'boo', 4: 'far', 5: 'faz'} + + We could get back an array with the actual names by using: + + >>> arr.apply(code_to_names) + a\b b0 b1 b2 + a0 foo baz bar + a1 boo bar faz + + Functions can also be applied along some axes: + + >>> # this is equivalent to (but much slower than): arr.sum_by('a') + ... arr.apply(sum, 'a') + a a0 a1 + 3 9 + + Applying the function along some axes will return an array with the + union of those axes and the axes of the returned values. For example, + let us define a function which returns the k highest values of an array. + + >>> def topk(a, k=2): + ... 
return a.sort_values(ascending=False).ignore_labels().i[:k] + >>> arr.apply(topk, 'a') + a\b* 0 1 + a0 2 1 + a1 5 3 + + Other arguments can be passed to the function as a tuple in the "args" argument: + + >>> arr.apply(topk, axes='a', args=(3,)) + a\b* 0 1 2 + a0 2 1 0 + a1 5 3 1 + + or by using keyword arguments: + + >>> arr.apply(topk, axes='a', k=3) + a\b* 0 1 2 + a0 2 1 0 + a1 5 3 1 + """ + if axes is None: + if isinstance(transform, abc.Mapping): + mapping = transform + + def transform(v): + return mapping.get(v, v) + if dtype is None: + vfunc = np.vectorize(transform) + else: + vfunc = np.vectorize(transform, otypes=[dtype]) + return LArray(vfunc(self.data, *args, **kwargs), self.axes) + else: + if not callable(transform): + raise TypeError("using the 'axes' argument in LArray.apply() only works with a function 'transform'") + # this is necessary so that stack output is nice. + # XXX: when iter_labels returns NDGroups, this might not be necessary anymore + axes = self.axes[axes] + # TODO: implement res_axes argument in stack. I guess computing res_axes (by examining each value) takes a + # significant time of stack and here we can know it in advance in the usual case (ie each return value + # of func has the same axes) + values = (self,) + args + tuple(kwargs.values()) + first_kw = 1 + len(args) + kwnames = tuple(kwargs.keys()) + res_arr = stack([(k, transform(*a_and_kwa[:first_kw], **dict(zip(kwnames, a_and_kwa[first_kw:])))) + for k, a_and_kwa in zip_array_items(values, axes, ascending)], + axis=axes, dtype=dtype) + + # transpose back axis where it was + return res_arr.transpose(self.axes & res_arr.axes) + + +def zip_array_values(values, axes=None, ascending=True): + r""" + + Parameters + ---------- + axes : int, str or Axis or tuple of them, optional + Axis or axes along which to iterate and in which order. Defaults to None (union of all axes present in + all arrays, in the order they are found). 
+ ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + + Returns + ------- + Sequence + + Examples + -------- + >>> arr1 = ndtest('a=a0,a1;b=b1,b2') + >>> arr2 = ndtest('a=a0,a1;c=c1,c2') + >>> arr1 + a\b b1 b2 + a0 0 1 + a1 2 3 + >>> arr2 + a\c c1 c2 + a0 0 1 + a1 2 3 + >>> for a1, a2 in zip_array_values((arr1, arr2), 'a'): + ... print("==") + ... print(a1) + ... print(a2) + == + b b1 b2 + 0 1 + c c1 c2 + 0 1 + == + b b1 b2 + 2 3 + c c1 c2 + 2 3 + >>> for a1, a2 in zip_array_values((arr1, arr2), arr2.c): + ... print("==") + ... print(a1) + ... print(a2) + == + a\b b1 b2 + a0 0 1 + a1 2 3 + a a0 a1 + 0 2 + == + a\b b1 b2 + a0 0 1 + a1 2 3 + a a0 a1 + 1 3 + >>> for a1, a2 in zip_array_values((arr1, arr2)): + ... print("arr1: {}, arr2: {}".format(a1, a2)) + arr1: 0, arr2: 0 + arr1: 0, arr2: 1 + arr1: 1, arr2: 0 + arr1: 1, arr2: 1 + arr1: 2, arr2: 2 + arr1: 2, arr2: 3 + arr1: 3, arr2: 2 + arr1: 3, arr2: 3 + """ + def values_with_expand(value, axes, readonly=True, ascending=True): + if isinstance(value, LArray): + # an Axis axis is not necessarily in array.axes + expanded = value.expand(axes, readonly=readonly) + return expanded.values(axes, ascending=ascending) + else: + size = axes.size if axes.ndim else 0 + return Repeater(value, size) + + all_axes = AxisCollection.union(*[get_axes(v) for v in values]) + if axes is None: + axes = all_axes + else: + if not isinstance(axes, (tuple, list, AxisCollection)): + axes = (axes,) + # transform string axes definitions to objects + axes = [Axis(axis) if isinstance(axis, basestring) and '=' in axis else axis + for axis in axes] + axes = AxisCollection([axis if isinstance(axis, Axis) else all_axes[axis] + for axis in axes]) + + # sequence of tuples (of scalar or arrays) + return SequenceZip([values_with_expand(v, axes, ascending=ascending) for v in values]) + + +def zip_array_items(values, axes=None, ascending=True): + r""" + + Parameters + ---------- + values 
: Iterable + arrays or values to combine. + axes : int, str or Axis or tuple of them, optional + Axis or axes along which to iterate and in which order. Defaults to None (union of all axes present in + all arrays, in the order they are found). + ascending : bool, optional + Whether or not to iterate the axes in ascending order (from start to end). Defaults to True. + + Returns + ------- + Sequence + + Examples + -------- + >>> arr1 = ndtest('a=a0,a1;b=b0,b1') + >>> arr2 = ndtest('a=a0,a1;c=c0,c1') + >>> arr1 + a\b b0 b1 + a0 0 1 + a1 2 3 + >>> arr2 + a\c c0 c1 + a0 0 1 + a1 2 3 + >>> for k, (a1, a2) in zip_array_items((arr1, arr2), 'a'): + ... print("==", k[0], "==") + ... print(a1) + ... print(a2) + == a0 == + b b0 b1 + 0 1 + c c0 c1 + 0 1 + == a1 == + b b0 b1 + 2 3 + c c0 c1 + 2 3 + >>> for k, (a1, a2) in zip_array_items((arr1, arr2), arr2.c): + ... print("==", k[0], "==") + ... print(a1) + ... print(a2) + == c0 == + a\b b0 b1 + a0 0 1 + a1 2 3 + a a0 a1 + 0 2 + == c1 == + a\b b0 b1 + a0 0 1 + a1 2 3 + a a0 a1 + 1 3 + >>> for k, (a1, a2) in zip_array_items((arr1, arr2)): + ... print(k, "arr1: {}, arr2: {}".format(a1, a2)) + (a.i[0], b.i[0], c.i[0]) arr1: 0, arr2: 0 + (a.i[0], b.i[0], c.i[1]) arr1: 0, arr2: 1 + (a.i[0], b.i[1], c.i[0]) arr1: 1, arr2: 0 + (a.i[0], b.i[1], c.i[1]) arr1: 1, arr2: 1 + (a.i[1], b.i[0], c.i[0]) arr1: 2, arr2: 2 + (a.i[1], b.i[0], c.i[1]) arr1: 2, arr2: 3 + (a.i[1], b.i[1], c.i[0]) arr1: 3, arr2: 2 + (a.i[1], b.i[1], c.i[1]) arr1: 3, arr2: 3 + """ + res_axes = AxisCollection.union(*[get_axes(v) for v in values]) + return SequenceZip((res_axes.iter_labels(axes, ascending=ascending), + zip_array_values(values, axes=axes, ascending=ascending))) + def larray_equal(a1, a2): import warnings @@ -9083,3 +9360,5 @@ def make_args_broadcastable(args, kwargs=None, min_axes=None): # - pyexcelerate: yet faster but also write only. Didn't check whether API is more featured than xlsxwriter or not. 
# - xlwings: wraps win32com & equivalent on mac, so can potentially do everything (I guess) but this is SLOW and needs # a running excel instance, etc. + +zip_array_values((1, 2)) From b16ef721b0df58bcc351ccc621e1b5a1208f5021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 12 Dec 2018 17:28:59 +0100 Subject: [PATCH 39/44] implemented LArray.apply --- doc/source/changes/version_0_30.rst.inc | 69 +++++++++++++++++++++++++ larray/core/array.py | 4 ++ 2 files changed, 73 insertions(+) diff --git a/doc/source/changes/version_0_30.rst.inc b/doc/source/changes/version_0_30.rst.inc index e6477ee95..63619e907 100644 --- a/doc/source/changes/version_0_30.rst.inc +++ b/doc/source/changes/version_0_30.rst.inc @@ -168,6 +168,75 @@ New features a0 0 0 2 a1 0 1 1 +* implemented :py:obj:`LArray.apply()` method to apply a python function or mapping to all + elements of an array or to all sub-arrays along some axes of an array and return the result. This is an extremely + versatile method as it can be used both with aggregating functions or element-wise functions. + + First let us define a test array + + >>> arr = LArray([[0, 2, 1], + ... [3, 1, 5]], 'a=a0,a1;b=b0..b2') + >>> arr + a\b b0 b1 b2 + a0 0 2 1 + a1 3 1 5 + + Here is a simple function we would like to apply to each element of the array. + Note that this particular example should rather be written as: arr ** 2 + as it is both more concise and much faster. + + >>> def square(x): + ... return x ** 2 + >>> arr.apply(square) + a\b b0 b1 b2 + a0 0 4 1 + a1 9 1 25 + + Now, assuming for a moment that the values of our test array above were in fact some numeric representation of + names and we had the correspondence to the actual names stored in a dictionary: + + >>> code_to_names = {0: 'foo', 1: 'bar', 2: 'baz', + ... 
3: 'boo', 4: 'far', 5: 'faz'} + + We could get back an array with the actual names by using: + + >>> arr.apply(code_to_names) + a\b b0 b1 b2 + a0 foo baz bar + a1 boo bar faz + + Functions can also be applied along some axes: + + >>> # this is equivalent to (but much slower than): arr.sum_by('a') + ... arr.apply(sum, 'a') + a a0 a1 + 3 9 + + Applying the function along some axes will return an array with the + union of those axes and the axes of the returned values. For example, + let us define a function which returns the k highest values of an array. + + >>> def topk(a, k=2): + ... return a.sort_values(ascending=False).ignore_labels().i[:k] + >>> arr.apply(topk, 'a') + a\b* 0 1 + a0 2 1 + a1 5 3 + + Other arguments can be passed to the function as a tuple in the "args" argument: + + >>> arr.apply(topk, axes='a', args=(3,)) + a\b* 0 1 2 + a0 2 1 0 + a1 5 3 1 + + or by using keyword arguments: + + >>> arr.apply(topk, axes='a', k=3) + a\b* 0 1 2 + a0 2 1 0 + a1 5 3 1 + * implemented :py:obj:`LArray.keys()` :py:obj:`LArray.values()` and :py:obj:`LArray.items()` methods to iterate (loop) on an array labels (keys), values or (key, value) pairs. diff --git a/larray/core/array.py b/larray/core/array.py index 5891f9129..f8a1655e8 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -7725,6 +7725,10 @@ def apply(self, transform, axes=None, dtype=None, ascending=True, args=(), **kwa a0 2 1 0 a1 5 3 1 """ + # XXX: we could go one step further than vectorize and support an array of callables which would be broadcasted + # with the other arguments. I don't know whether that would actually help because I think it is always + # possible to emulate that with a single callable with an extra argument (e.g. type) which dispatches to + # potentially different callables. It might be more practical & efficient though.
if axes is None: if isinstance(transform, abc.Mapping): mapping = transform From d284d8abd612399c6ea9d4f518f5f4ac107ac58b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 11 Mar 2019 14:31:33 +0100 Subject: [PATCH 40/44] WIP: added axes_names argument to as_table --- larray/core/array.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index f8a1655e8..c85b60cc7 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -2337,8 +2337,8 @@ def __iter__(self): def __contains__(self, key): return any(key in axis for axis in self.axes) - def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_name='value'): - """ + def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_name='value', axes_names=True): + r""" Generator. Returns next line of the table representing an array. Parameters @@ -2360,6 +2360,8 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_nam value_name : str, optional Name of the column containing the values (last column) when `wide=False` (see above). Defaults to 'value'. + axes_names : bool or 'except_last', optional + Whether or not to include the last axis name preceded by a '\'. Defaults to True. 
Returns ------- @@ -2370,13 +2372,13 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_nam -------- >>> arr = ndtest((2, 2, 3)) >>> list(arr.as_table()) # doctest: +NORMALIZE_WHITESPACE - [['a', 'b\\\\c', 'c0', 'c1', 'c2'], + [['a', 'b\\c', 'c0', 'c1', 'c2'], ['a0', 'b0', 0, 1, 2], ['a0', 'b1', 3, 4, 5], ['a1', 'b0', 6, 7, 8], ['a1', 'b1', 9, 10, 11]] >>> list(arr.as_table(light=True)) # doctest: +NORMALIZE_WHITESPACE - [['a', 'b\\\\c', 'c0', 'c1', 'c2'], + [['a', 'b\\c', 'c0', 'c1', 'c2'], ['a0', 'b0', 0, 1, 2], ['', 'b1', 3, 4, 5], ['a1', 'b0', 6, 7, 8], @@ -2409,13 +2411,19 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_nam width = 1 height = int(np.prod(self.shape)) data = np.asarray(self).reshape(height, width) + display_axes_names = axes_names # get list of names of axes axes_names = self.axes.display_names[:] # transforms ['a', 'b', 'c', 'd'] into ['a', 'b', 'c\\d'] if wide and len(axes_names) > 1: - axes_names[-2] = '\\'.join(axes_names[-2:]) - axes_names.pop() + if display_axes_names is True: + axes_names[-2] = '\\'.join(axes_names[-2:]) + axes_names.pop() + elif display_axes_names == 'except_last': + axes_names = axes_names[:-1] + else: + axes_names = [''] * (len(axes_names) - 1) axes = self.axes[:-1] if wide else self.axes # get list of labels for each axis (except the last one if wide=True) labels = [axis.labels.tolist() for axis in axes] From bf1f63c6837a5235225626aaab7884297cc69ee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Mon, 11 Mar 2019 14:33:55 +0100 Subject: [PATCH 41/44] WIP: added axes_names and na_repr arguments to LArray.dump --- larray/core/array.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index c85b60cc7..f59513058 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -2458,7 +2458,9 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, 
wide=True, value_nam # returns next line (labels of N-1 first axes + data) yield list(tick) + dataline.tolist() - def dump(self, header=True, wide=True, value_name='value', light=False): + # TODO: merge with as_table + # XXX: dump as a 2D LArray with row & col dims? + def dump(self, header=True, wide=True, value_name='value', light=False, axes_names=True, na_repr='as_is'): """Dump array as a 2D nested list Parameters @@ -2475,6 +2477,13 @@ def dump(self, header=True, wide=True, value_name='value', light=False): light : bool, optional Whether or not to hide repeated labels. In other words, only show a label if it is different from the previous one. Defaults to False. + axes_names : bool or 'except_last', optional + Assuming header is True, whether or not to include axes names. Defaults to True. If axes_names is + 'except_last', all axes names will be included except the last. + last_axis : bool, optional + Whether or not to include the last axis name preceded by a '\'. Defaults to True. + na_repr : any scalar, optional + Replace missing values (NaN floats) by this value. Defaults to 'as_is' (do not do any replacement). Returns ------- @@ -2482,9 +2491,15 @@ def dump(self, header=True, wide=True, value_name='value', light=False): """ if not header: # flatten all dimensions except the last one - return self.data.reshape(-1, self.shape[-1]).tolist() + res2d = self.data.reshape(-1, self.shape[-1]).tolist() else: - return list(self.as_table(wide=wide, value_name=value_name, light=light)) + res2d = list(self.as_table(wide=wide, value_name=value_name, light=light, axes_names=axes_names)) + if na_repr != 'as_is': + # NaN is the only value which is not equal to itself + res2d = [[na_repr if value != value else value + for value in line] + for line in res2d] + return res2d # XXX: should filter(geo=['W']) return a view by default?
(collapse=True) # I think it would be dangerous to make it the default From 775ac3e318ef7ed195a1816d4e83ad8eabace47e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 15 Mar 2019 11:43:08 +0100 Subject: [PATCH 42/44] WIP: added important FIXMEs --- larray/core/array.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/larray/core/array.py b/larray/core/array.py index f59513058..b63d19efd 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -2425,6 +2425,7 @@ def as_table(self, maxlines=None, edgeitems=5, light=False, wide=True, value_nam else: axes_names = [''] * (len(axes_names) - 1) axes = self.axes[:-1] if wide else self.axes + # MEGA-FIXME: ensure that it will work in xlwings (see dump below) # get list of labels for each axis (except the last one if wide=True) labels = [axis.labels.tolist() for axis in axes] # creates vertical lines (ticks is a list of list) @@ -2490,7 +2491,13 @@ def dump(self, header=True, wide=True, value_name='value', light=False, axes_nam 2D nested list """ if not header: + # MEGA-FIXME: ensure that either + # * we have no numpy types left here (this can be the case with tolist if we have a numpy array with + # object dtype with numpy types in some of its cells (this is the 65535 dc2019 bug)!) + # * xlwings accepts those + # Unsure where this should be fixed. In np.array.tolist, in xlwings or in larray. 
# flatten all dimensions except the last one + # same fix should be applied in as_table above (it uses tolist too) res2d = self.data.reshape(-1, self.shape[-1]).tolist() else: res2d = list(self.as_table(wide=wide, value_name=value_name, light=light, axes_names=axes_names)) From 28be4952b4e071561ef076160794336f5e09db0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 15 Mar 2019 11:43:38 +0100 Subject: [PATCH 43/44] WIP: added a few XXX in Excel handling code --- larray/inout/xw_excel.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/larray/inout/xw_excel.py b/larray/inout/xw_excel.py index 560d42408..d9b93ab32 100644 --- a/larray/inout/xw_excel.py +++ b/larray/inout/xw_excel.py @@ -270,6 +270,7 @@ def close(self): if self.filepath is not None and os.path.isfile(self.xw_wkb.fullname): tmp_file = self.xw_wkb.fullname self.xw_wkb.close() + # XXX: do we check for this case earlier and act differently depending on overwrite? os.remove(self.filepath) os.rename(tmp_file, self.filepath) else: @@ -292,6 +293,10 @@ def __enter__(self): return self def __exit__(self, type_, value, traceback): + # XXX: we should probably also avoid closing the workbook for visible=True??? + # XXX: we might want to disallow using open_excel as a context manager (in __enter__) + # when we have nothing to do in close because it is kinda misleading (this might piss off + # users though, so maybe a warning would be better).
if not self.active_workbook: self.close() From 87949d8f29a47ab4065eca3c756604cb3e6b9d3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Wed, 20 Mar 2019 11:05:18 +0100 Subject: [PATCH 44/44] WIP: added support for saving a file with a password --- larray/inout/xw_excel.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/larray/inout/xw_excel.py b/larray/inout/xw_excel.py index d9b93ab32..4eb3efc95 100644 --- a/larray/inout/xw_excel.py +++ b/larray/inout/xw_excel.py @@ -25,6 +25,7 @@ if xw is not None: from xlwings.conversion.pandas_conv import PandasDataFrameConverter + from xlwings.constants import FileFormat global_app = None @@ -257,12 +258,23 @@ def __delitem__(self, key): def sheet_names(self): return [s.name for s in self] - def save(self, path=None): + def save(self, path=None, password=None): # saved_path = self.xw_wkb.api.Path # was_saved = saved_path != '' if path is None and self.delayed_filepath is not None: path = self.delayed_filepath - self.xw_wkb.save(path=path) + + if password is not None: + if path is None: + raise ValueError("saving a Workbook with a password is only supported for workbooks with an " + "explicit path (given either when opening the workbook or here as the path " + "argument)") + realpath = os.path.realpath(path) + # XXX: this is probably Windows only + # using Password as keyword argument does not work ! + self.xw_wkb.api.SaveAs(realpath, FileFormat.xlOpenXMLWorkbook, password) + else: + self.xw_wkb.save(path=path) def close(self): # Close the workbook in Excel.