
implemented Session.to_pickle #242


Merged: 8 commits merged on May 4, 2017
80 changes: 46 additions & 34 deletions doc/source/changes/version_0_22.rst.inc
@@ -1,8 +1,14 @@
New features
------------

- * implemented a new describe() method on arrays to give quick summary statistics (excluding NaN values). By default,
-   it includes the number of values, mean, standard deviation, minimum, 25, 50 and 75 percentiles and maximum.
+ * viewer: added a menu bar with the ability to clear the current session, save all its arrays to a file (.h5, .xlsx,
+   .pkl or a directory containing multiple .csv files), and load arrays from such a file (closes :issue:`88`).

Collaborator:

Will the .pkl contain more than the arrays? E.g. user-defined axes or groups?

Contributor Author:

No, it currently contains only the arrays, like the other formats.

+   WARNING: Only array objects are currently saved. It means that scalars, functions or other non-LArray objects
+   defined in the console are *not* saved in the file.
+
+ * implemented a new describe() method on arrays to give quick summary statistics. By default, it includes the number of
+   non-NaN values, the mean, standard deviation, minimum, 25, 50 and 75 percentiles and maximum.

>>> arr = ndrange('gender=Male,Female;year=2014..2020').astype(float)
>>> arr
@@ -13,7 +19,7 @@
statistic | count | mean | std | min | 25% | 50% | 75% | max
| 14.0 | 6.5 | 4.031128874149275 | 0.0 | 3.25 | 6.5 | 9.75 | 13.0

- an optional keyword argument allow to specify different percentiles to include
+ an optional keyword argument allows to specify different percentiles to include

>>> arr.describe(percentiles=[20, 40, 60, 80])
statistic | count | mean | std | min | 20% | 40% | 60% | 80% | max
@@ -34,7 +40,7 @@

This closes :issue:`184`.
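The statistics in the describe() doctest above can be reproduced with plain NumPy. This is only a sketch of what the method presumably computes: the dict layout is mine, and the ddof=0 choice for std is inferred from the doctest output (std of 0..13 is 4.0311... only with ddof=0), not taken from the implementation.

```python
import numpy as np

# sketch of the statistics describe() reports, using plain NumPy;
# names/layout are assumptions made for illustration
data = np.arange(14, dtype=float)
clean = data[~np.isnan(data)]          # describe() excludes NaN values
summary = {
    "count": float(clean.size),
    "mean": float(clean.mean()),
    "std": float(clean.std()),         # ddof=0, matching the doctest output
    "min": float(clean.min()),
    "25%": float(np.percentile(clean, 25)),
    "50%": float(np.percentile(clean, 50)),
    "75%": float(np.percentile(clean, 75)),
    "max": float(clean.max()),
}
```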

- * implemented reindex allowing to change order of labels (and add new ones) of one or several axes:
+ * implemented reindex allowing to change the order of labels and add/remove some of them to one or several axes:

>>> arr = ndtest((2, 2))
>>> arr
@@ -52,12 +58,26 @@
a1 | -1 | 3 | 2
a2 | -1 | -1 | -1
a0 | -1 | 1 | 0

using reindex, one can make an array compatible with another array which has more/fewer labels or has labels in a
different order:

>>> arr2 = ndtest((3, 3))
>>> arr.reindex(arr2.axes, fill_value=-1)
>>> arr2
a\\b | b0 | b1 | b2
a0 | 0 | 1 | -1
a1 | 2 | 3 | -1
a2 | -1 | -1 | -1
a0 | 0 | 1 | 2
a1 | 3 | 4 | 5
a2 | 6 | 7 | 8
>>> arr.reindex(arr2.axes, fill_value=0)
a\\b | b0 | b1 | b2
a0 | 0 | 1 | 0
a1 | 2 | 3 | 0
a2 | 0 | 0 | 0
>>> arr.reindex(arr2.axes, fill_value=0) + arr2
a\\b | b0 | b1 | b2
a0 | 0 | 2 | 2
a1 | 5 | 7 | 5
a2 | 6 | 7 | 8

This closes :issue:`18`.
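For readers more familiar with pandas, the same align-with-fill pattern exists there as DataFrame.reindex, which also accepts a fill_value. A small sketch, with values chosen to mirror the 2x2 `arr` above:

```python
import pandas as pd

# 2x2 array aligned to a larger 3x3 label set, missing cells filled with 0
arr = pd.DataFrame([[0, 1], [2, 3]], index=["a0", "a1"], columns=["b0", "b1"])
big = arr.reindex(index=["a0", "a1", "a2"], columns=["b0", "b1", "b2"], fill_value=0)
```

As in the larray example, the reindexed result can then be combined element-wise with any array sharing the larger label set.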

@@ -85,14 +105,6 @@

* viewer: added possibility to delete an array by pressing Delete on keyboard (closes :issue:`116`).

* viewer: added a menu bar 'File' with options 'New', 'Open', 'Save', 'SaveAs', 'Open Recent' and 'Quit'.

WARNING: Only LArray objects are saved.

(closes :issue:`88`)

* renamed dump, dump_hdf, dump_excel and dump_csv methods of class Session as save, to_hdf, to_excel and to_csv
(closes :issue:`217`)

.. _misc:

@@ -108,6 +120,8 @@ Miscellaneous improvements

(closes :issue:`152`)

* renamed Session.dump, dump_hdf, dump_excel and dump_csv to save, to_hdf, to_excel and to_csv (closes :issue:`217`).

* changed default value of `ddof` argument for var and std functions from 0 to 1 (closes :issue:`190`).

* implemented a new syntax for stack(): `stack({label1: value1, label2: value2}, axis)`
@@ -193,6 +207,16 @@ Miscellaneous improvements
Axis(['c0', 'c1', 'c2'], 'column')
])

* added possibility to delete an array from a session:

>>> s = Session({'a': ndtest((3, 3)), 'b': ndtest((2, 4)), 'c': ndtest((4, 2))})
>>> s.names
['a', 'b', 'c']
>>> del s.b
>>> del s['c']
>>> s.names
['a']

* made create_sequential's axis argument accept axis definitions (e.g. a string definition) in addition to Axis
  objects (closes :issue:`160`).

@@ -222,30 +246,22 @@ Miscellaneous improvements

will autocomplete to `s.a_nice_test_array` and `s['a_<tab>` will be completed to `s['a_nice_test_array`

* added possibility to delete an array from a session:

>>> s = Session({'a': ndtest((3, 3)), 'b': ndtest((2, 4)), 'c': ndtest((4, 2))})
>>> s.names
['a', 'b', 'c']
>>> del s.b
>>> del s['c']
>>> s.names
['a']

* made warning messages for division by 0 and invalid values (usually caused by 0 / 0) point to the user code line,
instead of the corresponding line in the larray module.

* preserve order of arrays in a session when saving to/loading from an .xlsx file.

* when creating a session from a directory containing CSV files, the directory may now contain other (non-CSV) files.


Fixes
-----

* fixed keyword arguments such as `out`, `ddof`, ... for aggregation functions (closes :issue:`189`).

- * fixed percentile(_by) with multiple percentiles values (i.e. when argument `q` is a list/tuple).
-   (closes :issue:`192`)
+ * fixed percentile(_by) with multiple percentiles values, i.e. when argument `q` is a list/tuple (closes :issue:`192`).

- * fixed group aggregates on integer arrays for median, percentile, var and std
-   (closes :issue:`193`).
+ * fixed group aggregates on integer arrays for median, percentile, var and std (closes :issue:`193`).

* fixed group sum over boolean arrays (closes :issue:`194`).

@@ -259,8 +275,4 @@
* fixed Workbook behavior in case of new workbook: the first added sheet replaces the default sheet `Sheet1`
(closes :issue:`230`).

* fixed order of arrays in sessions loaded from files. The order is now the same as in the file.

* fixed loading session from CSV files in a directory. The directory may contain other non-csv files.

* fixed with_axes warning to refer to set_axes instead of replace_axes.
14 changes: 7 additions & 7 deletions larray/core.py
@@ -115,7 +115,7 @@
from larray.utils import (table2str, size2str, unique, csv_open, long,
decode, basestring, unicode, bytes, izip, rproduct,
ReprString, duplicates, array_lookup2, strip_rows,
- skip_comment_cells, find_closing_chr, StringIO, PY3,
+ skip_comment_cells, find_closing_chr, StringIO, PY2,
float_error_handler_factory)


@@ -1434,7 +1434,7 @@ def _is_key_type_compatible(self, key):
# vice versa), so we shouldn't be more picky here than dict hashing
str_key = key_kind in ('S', 'U')
str_label = label_kind in ('S', 'U')
- py2_str_match = not PY3 and str_key and str_label
+ py2_str_match = PY2 and str_key and str_label
# object kind can match anything
return key_kind == label_kind or \
key_kind == 'O' or label_kind == 'O' or \
@@ -2018,11 +2018,7 @@ def _binop(opname):
op_fullname = '__%s__' % opname

# TODO: implement this in a delayed fashion for reference axes
- if PY3:
-     def opmethod(self, other):
-         other_value = other.eval() if isinstance(other, Group) else other
-         return getattr(self.eval(), op_fullname)(other_value)
- else:
+ if PY2:
# workaround the fact slice objects do not have any __binop__ methods defined on Python2 (even though
# the actual operations work on them).
def opmethod(self, other):
@@ -2034,6 +2030,10 @@ def opmethod(self, other):
self_value = (self_value.start, self_value.stop, self_value.step)
other_value = (other_value.start, other_value.stop, other_value.step)
return getattr(self_value, op_fullname)(other_value)
+ else:
+     def opmethod(self, other):
+         other_value = other.eval() if isinstance(other, Group) else other
+         return getattr(self.eval(), op_fullname)(other_value)

opmethod.__name__ = op_fullname
return opmethod
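The Python 2 branch above compensates for slice objects lacking rich-comparison methods by comparing (start, stop, step) tuples instead. A standalone sketch of that fallback (the helper name is made up for illustration):

```python
def slice_binop(a, b, op_fullname):
    # Python 2 slices had no __eq__/__ne__/... methods, so fall back to
    # comparing their (start, stop, step) tuples, as the PY2 branch does
    ta = (a.start, a.stop, a.step)
    tb = (b.start, b.stop, b.step)
    return getattr(ta, op_fullname)(tb)
```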
85 changes: 62 additions & 23 deletions larray/session.py
@@ -8,9 +8,9 @@
import numpy as np
from pandas import ExcelWriter, ExcelFile, HDFStore

- from .core import LArray, Axis, read_csv, read_hdf, df_aslarray, larray_equal, larray_nan_equal, get_axes
+ from .core import LArray, Axis, read_csv, read_hdf, df_aslarray, larray_nan_equal, get_axes
from .excel import open_excel
- from .utils import float_error_handler_factory
+ from .utils import float_error_handler_factory, pickle


def check_pattern(k, pattern):
@@ -81,8 +81,8 @@ def read_arrays(self, keys, *args, **kwargs):

Returns
-------
- dict(str,LArray)
-     Dictionary containing names and arrays loaded from a file.
+ OrderedDict(str, LArray)
+     Dictionary containing the loaded arrays.
"""
display = kwargs.pop('display', False)
self._open_for_read()
@@ -106,11 +106,10 @@ def dump_arrays(self, key_values, *args, **kwargs):

Parameters
----------
- key_values : dict of paris (str, LArray)
-     Dictionary containing arrays to dump.
+ key_values : list of (str, LArray) pairs
+     Name and data of arrays to dump.
  kwargs :
- * display: a small message is displayed to tell when
-   an array is started to be dump and when it's done.
+ * display: whether or not to display when the dump of each array is started/done.
"""
display = kwargs.pop('display', False)
self._open_for_write()
@@ -212,7 +211,7 @@ def _open_for_write(self):
def list(self):
# strip extension from files
# TODO: also support fname pattern, eg. "dump_*.csv" (using glob)
- return [os.path.splitext(fname)[0] for fname in os.listdir(self.fname) if '.csv' in fname]
+ return sorted([os.path.splitext(fname)[0] for fname in os.listdir(self.fname) if '.csv' in fname])

def _read_array(self, key, *args, **kwargs):
fpath = os.path.join(self.fname, '{}.csv'.format(key))
@@ -225,17 +224,41 @@ def close(self):
pass


class PickleHandler(FileHandler):
def _open_for_read(self):
with open(self.fname, 'rb') as f:
self.data = pickle.load(f)

def _open_for_write(self):
self.data = OrderedDict()

def list(self):
return self.data.keys()

def _read_array(self, key):
return self.data[key]

def _dump(self, key, value):
self.data[key] = value

def close(self):
with open(self.fname, 'wb') as f:
pickle.dump(self.data, f)
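PickleHandler accumulates arrays in an OrderedDict and serializes the whole dict in one go on close. An in-memory sketch of that round trip, with plain lists standing in for LArray objects:

```python
import io
import pickle
from collections import OrderedDict

# dump an OrderedDict of name -> array data, then load it back,
# mirroring PickleHandler's write/read cycle (BytesIO replaces the file)
data = OrderedDict([("a", [1, 2, 3]), ("b", [4, 5])])
buf = io.BytesIO()
pickle.dump(data, buf)
buf.seek(0)
loaded = pickle.load(buf)
```

Because the container is an OrderedDict, the order in which arrays were dumped survives the round trip.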


  handler_classes = {
+     'pickle': PickleHandler,
+     'pandas_csv': PandasCSVHandler,
      'pandas_hdf': PandasHDFHandler,
      'pandas_excel': PandasExcelHandler,
      'xlwings_excel': XLWingsHandler,
-     'pandas_csv': PandasCSVHandler
  }

  ext_default_engine = {
+     'csv': 'pandas_csv',
      'h5': 'pandas_hdf', 'hdf': 'pandas_hdf',
+     'pkl': 'pickle', 'pickle': 'pickle',
      'xls': 'xlwings_excel', 'xlsx': 'xlwings_excel',
-     'csv': 'pandas_csv'
  }
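Resolving engine='auto' presumably comes down to an extension lookup in this mapping. A hypothetical helper sketching that dispatch (engine_for is not part of the PR; only the mapping contents come from it):

```python
import os

# extension -> default engine, copied from the mapping above
ext_default_engine = {
    'csv': 'pandas_csv',
    'h5': 'pandas_hdf', 'hdf': 'pandas_hdf',
    'pkl': 'pickle', 'pickle': 'pickle',
    'xls': 'xlwings_excel', 'xlsx': 'xlwings_excel',
}

def engine_for(fname):
    # strip the leading dot from the extension and look up the engine
    ext = os.path.splitext(fname)[1].lstrip('.').lower()
    return ext_default_engine[ext]
```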


@@ -364,6 +387,9 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
"""
Loads array objects from a file.

WARNING: never load a file using the pickle engine (.pkl or .pickle) from an untrusted source, as it can lead
to arbitrary code execution.

Parameters
----------
fname : str
@@ -415,13 +441,28 @@ def save(self, fname, names=None, engine='auto', display=False, **kwargs):
engine = ext_default_engine[ext]
handler_cls = handler_classes[engine]
handler = handler_cls(fname)
- filtered = self.filter(kind=LArray)
- # not using .items() so that arrays are sorted
- arrays = [(k, filtered[k]) for k in filtered.names]
+ items = self.filter(kind=LArray).items()
  if names is not None:
      names_set = set(names)
-     arrays = [(k, v) for k, v in arrays if k in names_set]
- handler.dump_arrays(arrays, display=display, **kwargs)
+     items = [(k, v) for k, v in items if k in names_set]
+ handler.dump_arrays(items, display=display, **kwargs)

def to_pickle(self, fname, names=None, *args, **kwargs):
"""
Dumps all array objects from the current session to a file using pickle.

WARNING: never load a pickle file (.pkl or .pickle) from an untrusted source, as it can lead to arbitrary code
execution.

Parameters
----------
fname : str
Path for the dump.
names : list of str or None, optional
List of names of objects to dump. Defaults to all objects
present in the Session.
"""
self.save(fname, names, ext_default_engine['pkl'], *args, **kwargs)
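The warning in the docstring deserves a concrete illustration: unpickling calls whatever callable a payload's __reduce__ names, so loading an untrusted pickle is equivalent to executing code. A harmless demonstration of the mechanism:

```python
import pickle

class Crafted:
    # __reduce__ tells pickle how to rebuild the object: a callable plus
    # its arguments. A malicious payload can name any importable callable.
    def __reduce__(self):
        return (eval, ("6 * 7",))

payload = pickle.dumps(Crafted())
result = pickle.loads(payload)  # "loading" silently evaluates the expression
```

Here the payload only evaluates an arithmetic expression, but the same hook could just as easily run os.system, which is why the docstrings warn against loading .pkl files from untrusted sources.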

def dump(self, fname, names=None, engine='auto', display=False, **kwargs):
warnings.warn("Method dump is deprecated. Use method save instead.", DeprecationWarning, stacklevel=2)
@@ -497,15 +538,12 @@ def filter(self, pattern=None, kind=None):
Session
The filtered session.
"""
+ items = self._objects.items()
  if pattern is not None:
-     items = [(k, self._objects[k]) for k in self._objects.keys()
-              if check_pattern(k, pattern)]
- else:
-     items = self._objects.items()
+     items = [(k, v) for k, v in items if check_pattern(k, pattern)]
  if kind is not None:
-     return Session([(k, v) for k, v in items if isinstance(v, kind)])
- else:
-     return Session(items)
+     items = [(k, v) for k, v in items if isinstance(v, kind)]
+ return Session(items)
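The rewritten filter() narrows the item list one criterion at a time instead of branching. A self-contained version of the same logic, assuming check_pattern does fnmatch-style matching (an assumption — check_pattern's body is not shown in this diff):

```python
from fnmatch import fnmatch

def filter_items(objects, pattern=None, kind=None):
    # successive narrowing, as in the rewritten Session.filter;
    # fnmatch stands in for check_pattern
    items = list(objects.items())
    if pattern is not None:
        items = [(k, v) for k, v in items if fnmatch(k, pattern)]
    if kind is not None:
        items = [(k, v) for k, v in items if isinstance(v, kind)]
    return items
```

Each filter is applied only when its argument is given, so filter_items(objs) with no criteria returns everything unchanged.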

@property
def names(self):
@@ -521,6 +559,7 @@ def names(self):
def copy(self):
"""Returns a copy of the session.
"""
# this actually *does* a copy of the internal mapping (the mapping is not reused as-is)
return Session(self._objects)

def keys(self):
12 changes: 7 additions & 5 deletions larray/tests/test_la.py
@@ -3,8 +3,8 @@
import os.path
import sys
from unittest import TestCase
import pytest

import pytest
import numpy as np
import pandas as pd

@@ -4224,7 +4224,9 @@ def test_stack(self):
assert_array_equal(res, expected)

if __name__ == "__main__":
import doctest
from larray import core
doctest.testmod(core)
unittest.main()
# import doctest
# import unittest
# from larray import core
# doctest.testmod(core)
# unittest.main()
pytest.main()