Skip to content

fix #291 + #293 + #313 : Session.save (0D arrays + Excel + overwrite file by default) #312

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 14, 2017
Merged
28 changes: 28 additions & 0 deletions doc/source/changes/version_0_24.rst.inc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@
a0 0 1
a1 2 3

* added new boolean argument 'overwrite' to Session.save, Session.to_hdf, Session.to_excel and Session.to_pickle
methods (closes issue:`293`). If overwrite=True, the file is removed and replaced by a new one if it already existed.
This is the default behavior. If overwrite=False, its content is updated :

>>> arr1, arr2, arr3 = ndtest((2, 2)), ndtest(4), ndtest((3, 2))
>>> s = Session([('arr1', arr1), ('arr2', arr2), ('arr3', arr3)])

>>> # save arr1, arr2 and arr3 in file output.h5
>>> s.save('output.h5')

>>> # replace arr1 and create arr4 + put them in a second session
>>> arr1, arr4 = ndtest((3, 3)), ndtest((2, 3))
>>> s2 = Session([('arr1', arr1), ('arr4', arr4)])

>>> # replace arr1 and add arr4 in file output.h5
>>> s2.save('output.h5', overwrite=False)

>>> # erase content of 'output.h5' and save only arrays contained in the second session
>>> s2.save('output.h5')


Miscellaneous improvements
--------------------------
Expand Down Expand Up @@ -63,3 +83,11 @@ Fixes
* fixed getting float data instead of int when converting an Excel Sheet or Range to an larray or numpy array.

* fixed some warning messages to point to the correct line in user code.

* fixed crash of Session.save method when it contains a 0D array.
0D arrays are now skipped when saving a session (closes issue:`291`).

* fixed Session.save and Session.to_excel failing to create a new Excel file
(it only worked if the file already existed). Closes issue:`313`.

* fixed Session.load(file, engine='pandas_excel') : axes were considered as anonymous.
54 changes: 41 additions & 13 deletions larray/core/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
names : list of str, optional
List of arrays to load. If `fname` is None, list of paths to CSV files.
Defaults to all valid objects present in the file/directory.
engine : str, optional
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
Load using `engine`. Defaults to 'auto' (use default engine for
the format guessed from the file extension).
display : bool, optional
Expand Down Expand Up @@ -234,7 +234,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
for k, v in arrays.items():
self[k] = v

def save(self, fname, names=None, engine='auto', display=False, **kwargs):
def save(self, fname, names=None, engine='auto', overwrite=True, display=False, **kwargs):
"""
Dumps all array objects from the current session to a file.

Expand All @@ -245,12 +245,14 @@ def save(self, fname, names=None, engine='auto', display=False, **kwargs):
names : list of str or None, optional
List of names of objects to dump. If `fname` is None, list of paths to CSV files.
Defaults to all objects present in the Session.
engine : str, optional
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
Dump using `engine`. Defaults to 'auto' (use default engine for
the format guessed from the file extension).
overwrite: bool, optional
Whether or not to overwrite an existing file, if any. Ignored for CSV files.
If False, file is updated. Defaults to True.
display : bool, optional
Whether or not to display which file is being worked on. Defaults
to False.
Whether or not to display which file is being worked on. Defaults to False.

Examples
--------
Expand All @@ -264,11 +266,20 @@ def save(self, fname, names=None, engine='auto', display=False, **kwargs):
Save only some arrays

>>> s.save('output.h5', ['arr1', 'arr3']) # doctest: +SKIP

Update file

>>> arr1, arr4 = ndtest((3, 3)), ndtest((2, 3)) # doctest: +SKIP
>>> s2 = Session([('arr1', arr1), ('arr4', arr4)]) # doctest: +SKIP
>>> # replace arr1 and add arr4 in file output.h5
>>> s2.save('output.h5', overwrite=False) # doctest: +SKIP
"""
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
if overwrite and engine != ext_default_engine['csv'] and os.path.isfile(fname):
os.remove(fname)
handler_cls = handler_classes[engine]
handler = handler_cls(fname)
items = self.filter(kind=LArray).items()
Expand Down Expand Up @@ -326,7 +337,7 @@ def to_globals(self, names=None, depth=0, warn=True):
for k, v in items:
d[k] = v

def to_pickle(self, fname, names=None, *args, **kwargs):
def to_pickle(self, fname, names=None, overwrite=True, display=False, **kwargs):
"""
Dumps all array objects from the current session to a file using pickle.

Expand All @@ -340,6 +351,11 @@ def to_pickle(self, fname, names=None, *args, **kwargs):
names : list of str or None, optional
List of names of objects to dump. Defaults to all objects
present in the Session.
overwrite: bool, optional
Whether or not to overwrite an existing file, if any.
If False, file is updated. Defaults to True.
display : bool, optional
Whether or not to display which file is being worked on. Defaults to False.

Examples
--------
Expand All @@ -354,13 +370,13 @@ def to_pickle(self, fname, names=None, *args, **kwargs):

>>> s.to_pickle('output.pkl', ['arr1', 'arr3']) # doctest: +SKIP
"""
self.save(fname, names, ext_default_engine['pkl'], *args, **kwargs)
self.save(fname, names, ext_default_engine['pkl'], overwrite, display, **kwargs)

def dump(self, fname, names=None, engine='auto', display=False, **kwargs):
    """
    Deprecated alias of `save`.

    Emits a DeprecationWarning, then forwards its arguments to `save`.

    Parameters
    ----------
    fname : str
        Path of the file to write, forwarded to `save`.
    names : list of str or None, optional
        Names of the objects to dump, forwarded to `save`.
    engine : str, optional
        Engine forwarded to `save`. Defaults to 'auto'.
    display : bool, optional
        Whether or not to display which file is being worked on. Defaults to False.
    **kwargs
        Extra keyword arguments forwarded to `save`.
    """
    warnings.warn("Method dump is deprecated. Use method save instead.", DeprecationWarning, stacklevel=2)
    # `display` has to be passed by keyword: the 4th positional parameter of `save` is `overwrite`,
    # so a positional `display` would silently become overwrite=False for a default dump() call.
    self.save(fname, names, engine, display=display, **kwargs)

def to_hdf(self, fname, names=None, *args, **kwargs):
def to_hdf(self, fname, names=None, overwrite=True, display=False, **kwargs):
"""
Dumps all array objects from the current session to an HDF file.

Expand All @@ -371,6 +387,11 @@ def to_hdf(self, fname, names=None, *args, **kwargs):
names : list of str or None, optional
List of names of objects to dump. Defaults to all objects
present in the Session.
overwrite: bool, optional
Whether or not to overwrite an existing file, if any.
If False, file is updated. Defaults to True.
display : bool, optional
Whether or not to display which file is being worked on. Defaults to False.

Examples
--------
Expand All @@ -385,13 +406,13 @@ def to_hdf(self, fname, names=None, *args, **kwargs):

>>> s.to_hdf('output.h5', ['arr1', 'arr3']) # doctest: +SKIP
"""
self.save(fname, names, ext_default_engine['hdf'], *args, **kwargs)
self.save(fname, names, ext_default_engine['hdf'], overwrite, display, **kwargs)

def dump_hdf(self, fname, names=None, *args, **kwargs):
    """
    Deprecated alias of `to_hdf`.

    Emits a DeprecationWarning, then forwards every argument unchanged to `to_hdf`.
    """
    message = "Method dump_hdf is deprecated. Use method to_hdf instead."
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    # NOTE(review): extra positional arguments are passed through as-is, so they bind to whatever
    # parameters follow `names` in `to_hdf` -- confirm legacy positional callers get what they expect.
    forwarded = (fname, names) + args
    self.to_hdf(*forwarded, **kwargs)

def to_excel(self, fname, names=None, *args, **kwargs):
def to_excel(self, fname, names=None, overwrite=True, display=False, **kwargs):
"""
Dumps all array objects from the current session to an Excel file.

Expand All @@ -402,6 +423,11 @@ def to_excel(self, fname, names=None, *args, **kwargs):
names : list of str or None, optional
List of names of objects to dump. Defaults to all objects
present in the Session.
overwrite: bool, optional
Whether or not to overwrite an existing file, if any.
If False, file is updated. Defaults to True.
display : bool, optional
Whether or not to display which file is being worked on. Defaults to False.

Examples
--------
Expand All @@ -416,13 +442,13 @@ def to_excel(self, fname, names=None, *args, **kwargs):

>>> s.to_excel('output.xlsx', ['arr1', 'arr3']) # doctest: +SKIP
"""
self.save(fname, names, ext_default_engine['xlsx'], *args, **kwargs)
self.save(fname, names, ext_default_engine['xlsx'], overwrite, display, **kwargs)

def dump_excel(self, fname, names=None, *args, **kwargs):
    """
    Deprecated alias of `to_excel`.

    Emits a DeprecationWarning, then forwards every argument unchanged to `to_excel`.
    """
    message = "Method dump_excel is deprecated. Use method to_excel instead."
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    # NOTE(review): extra positional arguments are passed through as-is, so they bind to whatever
    # parameters follow `names` in `to_excel` -- confirm legacy positional callers get what they expect.
    forwarded = (fname, names) + args
    self.to_excel(*forwarded, **kwargs)

def to_csv(self, fname, names=None, *args, **kwargs):
def to_csv(self, fname, names=None, display=False, **kwargs):
"""
Dumps all array objects from the current session to CSV files.

Expand All @@ -433,6 +459,8 @@ def to_csv(self, fname, names=None, *args, **kwargs):
names : list of str or None, optional
List of names of objects to dump. Defaults to all objects
present in the Session.
display : bool, optional
Whether or not to display which file is being worked on. Defaults to False.

Examples
--------
Expand All @@ -447,7 +475,7 @@ def to_csv(self, fname, names=None, *args, **kwargs):

>>> s.to_csv('./Output', ['arr1', 'arr3']) # doctest: +SKIP
"""
self.save(fname, names, ext_default_engine['csv'], *args, **kwargs)
self.save(fname, names, ext_default_engine['csv'], display=display, **kwargs)

def dump_csv(self, fname, names=None, *args, **kwargs):
warnings.warn("Method dump_csv is deprecated. Use method to_csv instead.", DeprecationWarning, stacklevel=2)
Expand Down
10 changes: 8 additions & 2 deletions larray/io/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections import OrderedDict
from pandas import ExcelWriter, ExcelFile, HDFStore

from larray.core.abc import ABCLArray
from larray.util.misc import pickle
from larray.io.excel import open_excel
from larray.io.array import df_aslarray, read_csv, read_hdf
Expand Down Expand Up @@ -114,6 +115,10 @@ def dump_arrays(self, key_values, *args, **kwargs):
display = kwargs.pop('display', False)
self._open_for_write()
for key, value in key_values:
if isinstance(value, ABCLArray) and value.ndim == 0:
if display:
print('Cannot dump {}. Dumping 0D arrays is currently not supported.'.format(key))
continue
if display:
print("dumping", key, "...", end=' ')
self._dump(key, value, *args, **kwargs)
Expand Down Expand Up @@ -164,7 +169,7 @@ def list(self):

def _read_array(self, key, *args, **kwargs):
    """
    Read the sheet named `key` from the opened handle and convert it to an array.

    Parameters
    ----------
    key : str
        Name of the sheet to parse.
    *args, **kwargs
        Forwarded unchanged to the `parse` method of `self.handle`.

    Returns
    -------
    The result of `df_aslarray` applied to the parsed sheet with raw=True.
    """
    df = self.handle.parse(key, *args, **kwargs)
    # A single return: the former `return df_aslarray(df)` placed before this one made the
    # raw=True conversion unreachable.
    # NOTE(review): raw=True is unconditional, even if the caller passes an index column through
    # args/kwargs -- confirm whether it should depend on `index_col`.
    return df_aslarray(df, raw=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the relation with the rest? Is this an extra fix? If so it needs a test & changelog :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an extra fix for an issue discovered when I tried to update the IO tests for Session ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact, this is not entirely correct. We should use: raw=index_col is None. But I admit the chance users ever hit this case is close to zero :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

forgot changelog, sorry.


def _dump(self, key, value, *args, **kwargs):
kwargs['engine'] = 'xlsxwriter'
Expand All @@ -182,7 +187,8 @@ def _open_for_read(self):
self.handle = open_excel(self.fname)

def _open_for_write(self):
    """
    Open `self.fname` with `open_excel` for writing and store the result in `self.handle`.

    `overwrite_file` is True only when no file exists yet at `self.fname`; an existing file is
    opened with overwrite_file=False.
    """
    # The file is opened exactly once: a preliminary open_excel(self.fname) call would have its
    # handle replaced immediately by the call below.
    # NOTE(review): presumably overwrite_file=True makes open_excel create the missing file --
    # confirm against open_excel.
    file_exists = os.path.isfile(self.fname)
    self.handle = open_excel(self.fname, overwrite_file=not file_exists)

def list(self):
    """Return what `sheet_names()` of the currently opened handle reports."""
    handle = self.handle
    return handle.sheet_names()
Expand Down
33 changes: 31 additions & 2 deletions larray/tests/test_session.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import, division, print_function

import os
from unittest import TestCase

import numpy as np
Expand Down Expand Up @@ -31,6 +32,7 @@ def setUp(self):
self.c = 'c'
self.d = {}
self.e = ndrange([(2, 'a0'), (3, 'a1')])
self.e2 = ndrange(('a=a0..a2', 'b=b0..b2'))
self.f = ndrange([(3, 'a0'), (2, 'a1')])
self.g = ndrange([(2, 'a0'), (4, 'a1')])
self.session = Session([
Expand Down Expand Up @@ -134,24 +136,37 @@ def test_names(self):

def test_h5_io(self):
    """Save the session to HDF, reload it, update one array in place, then reload a subset."""
    fpath = abspath('test_session.h5')

    self.session.save(fpath)

    s = Session()
    s.load(fpath)
    # HDF does *not* keep ordering (ie, keys are always sorted)
    self.assertEqual(list(s.keys()), ['e', 'f', 'g'])

    # update an array (overwrite=False)
    Session(e=self.e2).save(fpath, overwrite=False)
    # Reload into a *fresh* session: Session.load only adds the loaded arrays to the target
    # session, so reusing `s` would keep 'f' and 'g' from the previous load and the key check
    # below could not detect a file that lost them.
    s = Session()
    s.load(fpath)
    self.assertEqual(list(s.keys()), ['e', 'f', 'g'])
    assert_array_nan_equal(s['e'], self.e2)

    s = Session()
    s.load(fpath, ['e', 'f'])
    self.assertEqual(list(s.keys()), ['e', 'f'])

def test_xlsx_pandas_io(self):
fpath = abspath('test_session.xlsx')
self.session.save(fpath, engine='pandas_excel')

s = Session()
s.load(fpath, engine='pandas_excel')
self.assertEqual(list(s.keys()), ['e', 'g', 'f'])

# update an array (overwrite=False)
Session(e=self.e2).save(fpath, engine='pandas_excel', overwrite=False)
s.load(fpath, engine='pandas_excel')
self.assertEqual(list(s.keys()), ['e', 'g', 'f'])
assert_array_nan_equal(s['e'], self.e2)

fpath = abspath('test_session_ef.xlsx')
self.session.save(fpath, ['e', 'f'], engine='pandas_excel')
s = Session()
Expand All @@ -161,12 +176,20 @@ def test_xlsx_pandas_io(self):
@pytest.mark.skipif(xw is None, reason="xlwings is not available")
def test_xlsx_xlwings_io(self):
fpath = abspath('test_session_xw.xlsx')
# test save when Excel file does not exist
self.session.save(fpath, engine='xlwings_excel')

s = Session()
s.load(fpath, engine='xlwings_excel')
# ordering is only kept if the file did not exist previously (otherwise the ordering is left intact)
self.assertEqual(list(s.keys()), ['e', 'g', 'f'])

# update an array (overwrite=False)
Session(e=self.e2).save(fpath, engine='xlwings_excel', overwrite=False)
s.load(fpath, engine='xlwings_excel')
self.assertEqual(list(s.keys()), ['e', 'g', 'f'])
assert_array_nan_equal(s['e'], self.e2)

fpath = abspath('test_session_ef_xw.xlsx')
self.session.save(fpath, ['e', 'f'], engine='xlwings_excel')
s = Session()
Expand All @@ -184,12 +207,18 @@ def test_csv_io(self):

def test_pickle_io(self):
    """Save the session with pickle, reload it, then update one array with overwrite=False."""
    fpath = abspath('test_session.pkl')

    self.session.save(fpath)

    s = Session()
    s.load(fpath, engine='pickle')
    self.assertEqual(list(s.keys()), ['e', 'g', 'f'])

    # update an array (overwrite=False)
    Session(e=self.e2).save(fpath, overwrite=False)
    # Reload into a *fresh* session: Session.load only adds the loaded arrays to the target
    # session, so reusing `s` would keep 'g' and 'f' from the previous load and the key check
    # below could not detect a file that lost them.
    # NOTE(review): assumes updating a pickle file keeps the original key order -- confirm
    # against the pickle handler.
    s = Session()
    s.load(fpath, engine='pickle')
    self.assertEqual(list(s.keys()), ['e', 'g', 'f'])
    assert_array_nan_equal(s['e'], self.e2)

def test_to_globals(self):
with pytest.warns(RuntimeWarning) as caught_warnings:
self.session.to_globals()
Expand Down