Skip to content

Commit b34c2f7

Browse files
committed
fix #915 :
- use openpyxl engine instead of xlrd in PandasExcelHandler to read Excel files with the standard .xlsx extension - use xlsxwriter by default if installed in PandasExcelHandler - dropped tests for blank cells when reading Excel files with openpyxl - added xlrd in travis.yml to fix error for Python 3.6 (pandas is asking to install xlrd)
1 parent 7fc6420 commit b34c2f7

11 files changed

+93
-102
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ install:
3636
# - pandas >= 0.20.0 is required since commit 01669f2024a7bffe47cceec0a0fd845f71b6f7cc
3737
# (issue 702 : fixed bug when writing metadata using HDF format)
3838
- conda create -n travisci --yes python=${TRAVIS_PYTHON_VERSION:0:3}
39-
"numpy>=1.13" "pandas>=0.20" pytables matplotlib xlrd openpyxl
39+
"numpy>=1.13" "pandas>=0.20" pytables matplotlib openpyxl
4040
xlsxwriter "pytest>=3.5"
4141
- source activate travisci
4242
- pip install pytest-flake8

README.rst

+4-2
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,17 @@ For IO (HDF, Excel)
8989
- `xlwings <https://www.xlwings.org/>`__:
9090
recommended package to get benefit of all Excel features of LArray.
9191
Only available on Windows and Mac platforms.
92-
- `xlrd <http://www.python-excel.org/>`__:
93-
for reading data and formatting information from older Excel files (ie: .xls)
9492
- `openpyxl <http://www.python-excel.org/>`__:
9593
recommended package for reading and writing
9694
Excel 2010 files (ie: .xlsx)
9795
- `xlsxwriter <http://www.python-excel.org/>`__:
9896
alternative package for writing data, formatting
9997
information and, in particular, charts in the
10098
Excel 2010 format (ie: .xlsx)
99+
- `xlrd <http://www.python-excel.org/>`__:
100+
for reading data and formatting information from older Excel files (ie: .xls)
101+
- `xlwt <http://www.python-excel.org/>`__:
102+
for writing data and formatting information to older Excel files (ie: .xls)
101103
- `larray_eurostat <https://github.com/larray-project/larray_eurostat>`__:
102104
provides functions to easily download EUROSTAT files as larray objects.
103105
Currently limited to TSV files.

binder/environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ dependencies:
88
- pandas
99
- matplotlib
1010
- pytables
11-
- xlrd
1211
- openpyxl
12+
- xlsxwriter
1313
- bokeh
1414
- rise
1515
- pip:

doc/environment.yml

-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ dependencies:
88
- pandas
99
- matplotlib
1010
- pytables
11-
- xlrd
1211
- openpyxl
1312
- sphinx
1413
- numpydoc

doc/requirements.txt

-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ numpy
22
pandas
33
matplotlib
44
tables # ==pytables
5-
xlrd
65
openpyxl
76
sphinx
87
numpydoc

doc/source/contribute.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ We'll now kick off a two-step process:
9898
conda config --add channels conda-forge
9999
100100
# Create and activate the build environment
101-
conda create -n larray_dev numpy pandas pytables pyqt qtpy matplotlib xlrd openpyxl xlsxwriter pytest pytest-pep8
101+
conda create -n larray_dev numpy pandas pytables pyqt qtpy matplotlib openpyxl xlsxwriter pytest pytest-pep8
102102
conda activate larray_dev
103103
104104
This will create the new environment, and not touch any of your existing environments,

larray/inout/excel.py

+31-20
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
import warnings
2+
import sys
3+
import os
24

35
import numpy as np
46
import pandas as pd
57
try:
68
import xlwings as xw
79
except ImportError:
810
xw = None
11+
try:
12+
import xlsxwriter
13+
except ImportError:
14+
xlsxwriter = None
915

1016
from larray.core.array import Array, asarray
1117
from larray.core.constants import nan
@@ -22,9 +28,9 @@
2228
__all__ = ['read_excel']
2329

2430

25-
# We use "# doctest: +SKIP" for all tests because they work only if xlrd (an *optional* dependency) is installed
2631
@deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1)
2732
@deprecate_kwarg('sheetname', 'sheet')
33+
# We use "# doctest: +SKIP" for all tests because they work only if openpyxl (an *optional* dependency) is installed
2834
def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, na=nan,
2935
sort_rows=False, sort_columns=False, wide=True, engine=None, range=slice(None), **kwargs):
3036
r"""
@@ -56,9 +62,11 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
5662
Whether or not to assume the array is stored in "wide" format.
5763
If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column.
5864
Defaults to True.
59-
engine : {'xlrd', 'xlwings'}, optional
60-
Engine to use to read the Excel file. If None (default), it will use 'xlwings' by default if the module is
61-
installed and relies on Pandas default reader otherwise.
65+
engine : {'xlwings', 'openpyxl', 'xlrd'}, optional
66+
Engine to use to read the Excel file.
67+
The 'xlrd' engine must be used to read Excel files with the old '.xls' extension.
68+
Either 'xlwings' or 'openpyxl' can be used to read Excel files with the standard '.xlsx' extension.
69+
Defaults to 'xlwings' if the module is installed, 'openpyxl' otherwise.
6270
range : str, optional
6371
Range to load the array from (only supported for the 'xlwings' engine). Defaults to slice(None) which loads
6472
the whole sheet, ignoring blank cells in the bottom right corner.
@@ -75,7 +83,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
7583
Read array from first sheet
7684
7785
>>> # The data below is derived from a subset of the demo_pjan table from Eurostat
78-
>>> read_excel(fname) # doctest: +SKIP
86+
>>> read_excel(fname) # doctest: +SKIP
7987
country gender\time 2013 2014 2015
8088
Belgium Male 5472856 5493792 5524068
8189
Belgium Female 5665118 5687048 5713206
@@ -87,7 +95,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
8795
Read array from a specific sheet
8896
8997
>>> # The data below is derived from a subset of the demo_fasec table from Eurostat
90-
>>> read_excel(fname, 'births') # doctest: +SKIP
98+
>>> read_excel(fname, 'births') # doctest: +SKIP
9199
country gender\time 2013 2014 2015
92100
Belgium Male 64371 64173 62561
93101
Belgium Female 61235 60841 59713
@@ -110,7 +118,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
110118
By default, cells associated with missing label combinations are filled with NaN. In that case, an int array
111119
is converted to a float array.
112120
113-
>>> read_excel(fname, sheet='population_missing_values') # doctest: +SKIP
121+
>>> read_excel(fname, sheet='population_missing_values') # doctest: +SKIP
114122
country gender\time 2013 2014 2015
115123
Belgium Male 5472856.0 5493792.0 5524068.0
116124
Belgium Female 5665118.0 5687048.0 5713206.0
@@ -121,7 +129,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
121129
122130
Using the ``fill_value`` argument, you can choose another value to use to fill missing cells.
123131
124-
>>> read_excel(fname, sheet='population_missing_values', fill_value=0) # doctest: +SKIP
132+
>>> read_excel(fname, sheet='population_missing_values', fill_value=0) # doctest: +SKIP
125133
country gender\time 2013 2014 2015
126134
Belgium Male 5472856 5493792 5524068
127135
Belgium Female 5665118 5687048 5713206
@@ -143,19 +151,19 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
143151
Germany Female 41142770 41210540 41362080
144152
145153
>>> # read the array stored in the sheet 'population_missing_axis_name' as is
146-
>>> arr = read_excel(fname, sheet='population_missing_axis_name') # doctest: +SKIP
154+
>>> arr = read_excel(fname, sheet='population_missing_axis_name') # doctest: +SKIP
147155
>>> # we expected a 3 x 2 x 3 array with data of type int
148156
>>> # but we got a 6 x 4 array with data of type object
149-
>>> arr.info # doctest: +SKIP
157+
>>> arr.info # doctest: +SKIP
150158
6 x 4
151159
country [6]: 'Belgium' 'Belgium' 'France' 'France' 'Germany' 'Germany'
152160
{1} [4]: 'gender' '2013' '2014' '2015'
153161
dtype: object
154162
memory used: 192 bytes
155163
>>> # using argument 'nb_axes', you can force the number of axes of the output array
156-
>>> arr = read_excel(fname, sheet='population_missing_axis_name', nb_axes=3) # doctest: +SKIP
164+
>>> arr = read_excel(fname, sheet='population_missing_axis_name', nb_axes=3) # doctest: +SKIP
157165
>>> # as expected, we have a 3 x 2 x 3 array with data of type int
158-
>>> arr.info # doctest: +SKIP
166+
>>> arr.info # doctest: +SKIP
159167
3 x 2 x 3
160168
country [3]: 'Belgium' 'France' 'Germany'
161169
gender [2]: 'Male' 'Female'
@@ -176,14 +184,14 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
176184
France 2015 66458153
177185
178186
>>> # to read arrays stored in 'narrow' format, you must pass wide=False to read_excel
179-
>>> read_excel(fname, 'population_narrow_format', wide=False) # doctest: +SKIP
187+
>>> read_excel(fname, 'population_narrow_format', wide=False) # doctest: +SKIP
180188
country\time 2013 2014 2015
181189
Belgium 11137974 11180840 11237274
182190
France 65600350 66165980 66458153
183191
184192
Extract array from a given range (xlwings only)
185193
186-
>>> read_excel(fname, 'population_births_deaths', range='A9:E15') # doctest: +SKIP
194+
>>> read_excel(fname, 'population_births_deaths', range='A9:E15') # doctest: +SKIP
187195
country gender\time 2013 2014 2015
188196
Belgium Male 64371 64173 62561
189197
Belgium Female 61235 60841 59713
@@ -200,7 +208,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan,
200208
sheet = _translate_sheet_name(sheet)
201209

202210
if engine is None:
203-
engine = 'xlwings' if xw is not None else None
211+
engine = 'xlwings' if xw is not None else 'openpyxl'
204212

205213
index_col = _get_index_col(nb_axes, index_col, wide)
206214

@@ -228,10 +236,13 @@ def __init__(self, fname, overwrite_file=False):
228236
super(PandasExcelHandler, self).__init__(fname, overwrite_file)
229237

230238
def _open_for_read(self):
231-
self.handle = pd.ExcelFile(self.fname)
239+
engine = 'openpyxl' if sys.version_info < (3, 7) else None
240+
self.handle = pd.ExcelFile(self.fname, engine=engine)
232241

233242
def _open_for_write(self):
234-
self.handle = pd.ExcelWriter(self.fname)
243+
_, ext = os.path.splitext(self.fname)
244+
engine = 'xlsxwriter' if ext == '.xlsx' and xlsxwriter is not None else None
245+
self.handle = pd.ExcelWriter(self.fname, engine=engine)
235246

236247
def list_items(self):
237248
sheet_names = self.handle.sheet_names
@@ -251,7 +262,7 @@ def _read_item(self, key, type, *args, **kwargs):
251262
raise TypeError()
252263

253264
def _dump_item(self, key, value, *args, **kwargs):
254-
kwargs['engine'] = 'xlsxwriter'
265+
kwargs['engine'] = self.handle.engine
255266
if isinstance(value, Array):
256267
value.to_excel(self.handle, key, *args, **kwargs)
257268
else:
@@ -260,15 +271,15 @@ def _dump_item(self, key, value, *args, **kwargs):
260271
def _read_metadata(self):
261272
sheet_meta = '__metadata__'
262273
if sheet_meta in self.handle.sheet_names:
263-
meta = read_excel(self.handle, sheet_meta, engine='xlrd', wide=False)
274+
meta = read_excel(self.handle, sheet_meta, engine=self.handle.engine, wide=False)
264275
return Metadata.from_array(meta)
265276
else:
266277
return Metadata()
267278

268279
def _dump_metadata(self, metadata):
269280
if len(metadata) > 0:
270281
metadata = asarray(metadata)
271-
metadata.to_excel(self.handle, '__metadata__', engine='xlsxwriter', wide=False, value_name='')
282+
metadata.to_excel(self.handle, '__metadata__', engine=self.handle.engine, wide=False, value_name='')
272283

273284
def save(self):
274285
pass

larray/tests/common.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
except ImportError:
1515
tables = None
1616
try:
17-
import xlrd
17+
import openpyxl
1818
except ImportError:
19-
xlrd = None
19+
openpyxl = None
2020
try:
2121
import xlsxwriter
2222
except ImportError:
@@ -145,7 +145,7 @@ def meta():
145145

146146
needs_xlwings = pytest.mark.skipif(xw is None, reason="xlwings is required for this test")
147147
needs_pytables = pytest.mark.skipif(tables is None, reason="pytables is required for this test")
148-
needs_xlrd = pytest.mark.skipif(xlrd is None, reason="xlrd is required for this test")
148+
needs_openpyxl = pytest.mark.skipif(openpyxl is None, reason="openpyxl is required for this test")
149149
needs_xlsxwriter = pytest.mark.skipif(xlsxwriter is None, reason="xlsxwriter is required for this test")
150150

151151
needs_python37 = pytest.mark.skipif(sys.version_info < (3, 7), reason="Python 3.7 is required for this test")

0 commit comments

Comments
 (0)