diff --git a/doc/source/changes/version_0_28.rst.inc b/doc/source/changes/version_0_28.rst.inc index bfa76bbf2..fa7907f49 100644 --- a/doc/source/changes/version_0_28.rst.inc +++ b/doc/source/changes/version_0_28.rst.inc @@ -289,6 +289,17 @@ Miscellaneous improvements Argument `transpose` has a different purpose than `wide` and is mainly useful to allow multiple axes as header when exporting arrays with more than 2 dimensions. Closes :issue:`575` and :issue:`371`. +* added argument `wide` to `read_csv` and `read_excel` functions. If False, the array to be loaded is assumed to + be stored in "narrow" format: + + >>> # assuming the array was saved using command: arr.to_excel('my_file.xlsx', wide=False) + >>> read_excel('my_file.xlsx', wide=False) + a\b b0 b1 b2 + a0 0 1 2 + a1 3 4 5 + + Closes :issue:`574`. + * added argument `name` to `to_series` method allowing to set a name to the Pandas Series returned by the method. * added argument `value_name` to `to_csv` and `to_excel` allowing to change the default name ('value') to diff --git a/larray/inout/array.py b/larray/inout/array.py index 052370732..a609fecdb 100644 --- a/larray/inout/array.py +++ b/larray/inout/array.py @@ -186,7 +186,7 @@ def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfo return LArray(data, axes) -def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, **kwargs): +def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, **kwargs): """ Prepare Pandas DataFrame and then convert it into LArray. @@ -205,6 +205,10 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header parse_header : bool, optional Whether or not to parse columns labels. Pandas treats column labels as strings. If True, column labels are converted into int, float or boolean when possible. Defaults to True. + wide : bool, optional + Whether or not to assume the array is stored in "wide" format. + If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column. + Defaults to True. Returns ------- @@ -218,19 +222,37 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header # irrespective of the actual data dimensionality if raw: columns = df.columns.values.tolist() - try: - # take the first column which contains '\' - pos_last = next(i for i, v in enumerate(columns) if isinstance(v, basestring) and '\\' in v) - except StopIteration: - # we assume first column will not contain data - pos_last = 0 - - # This is required to handle int column names (otherwise we can simply use column positions in set_index). - # This is NOT the same as df.columns[list(range(...))] ! - index_columns = [df.columns[i] for i in range(pos_last + 1)] - # TODO: we should pass a flag to df_aslarray so that we can use inplace=True here - # df.set_index(index_columns, inplace=True) - df = df.set_index(index_columns) + if wide: + try: + # take the first column which contains '\' + pos_last = next(i for i, v in enumerate(columns) if isinstance(v, basestring) and '\\' in v) + except StopIteration: + # we assume first column will not contain data + pos_last = 0 + + # This is required to handle int column names (otherwise we can simply use column positions in set_index). + # This is NOT the same as df.columns[list(range(...))] ! 
+ index_columns = [df.columns[i] for i in range(pos_last + 1)] + df.set_index(index_columns, inplace=True) + else: + index_columns = [df.columns[i] for i in range(len(df.columns) - 1)] + df.set_index(index_columns, inplace=True) + series = df[df.columns[-1]] + if isinstance(series.index, pd.core.index.MultiIndex): + fill_value = kwargs.get('fill_value', np.nan) + # TODO: use argument sort=False when it will be available + # (see https://github.com/pandas-dev/pandas/issues/15105) + df = series.unstack(level=-1, fill_value=fill_value) + # pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns + labels = df_labels(series, sort=False) + index = pd.MultiIndex.from_tuples(list(product(*labels[:-1])), names=series.index.names[:-1]) + columns = labels[-1] + df = df.reindex(index=index, columns=columns, fill_value=fill_value) + else: + series.name = series.index.name + if sort_rows: + raise ValueError('sort_rows=True is not valid for 1D arrays. Please use sort_columns instead.') + return from_series(series, sort_rows=sort_columns) # handle 1D if len(df) == 1 and (pd.isnull(df.index.values[0]) or @@ -249,9 +271,24 @@ def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header unfold_last_axis_name=unfold_last_axis_name, **kwargs) +def _get_index_col(nb_axes=None, index_col=None, wide=True): + if not wide: + if nb_axes is not None or index_col is not None: + raise ValueError("`nb_axes` or `index_col` argument cannot be used when `wide` argument is False") + + if nb_axes is not None and index_col is not None: + raise ValueError("cannot specify both `nb_axes` and `index_col`") + elif nb_axes is not None: + index_col = list(range(nb_axes - 1)) + elif isinstance(index_col, int): + index_col = [index_col] + + return index_col + + @deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1) def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headersep=None, fill_value=np.nan, - na=np.nan, sort_rows=False, sort_columns=False, dialect='larray', **kwargs): + na=np.nan, sort_rows=False, sort_columns=False, wide=True, dialect='larray', **kwargs): """ Reads csv file and returns an array with the contents. @@ -288,6 +325,10 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse sort_columns : bool, optional Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting). Defaults to False. + wide : bool, optional + Whether or not to assume the array is stored in "wide" format. + If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column. + Defaults to True. dialect : 'classic' | 'larray' | 'liam2', optional Name of dialect. Defaults to 'larray'. **kwargs @@ -298,22 +339,54 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse Examples -------- - >>> from larray import ndrange >>> tmpdir = getfixture('tmpdir') >>> fname = os.path.join(tmpdir.strpath, 'test.csv') >>> a = ndtest('nat=BE,FO;sex=M,F') - + >>> a + nat\\sex M F + BE 0 1 + FO 2 3 >>> a.to_csv(fname) + >>> with open(fname) as f: + ... print(f.read().strip()) + nat\\sex,M,F + BE,0,1 + FO,2,3 >>> read_csv(fname) nat\\sex M F BE 0 1 FO 2 3 + + Sort columns + >>> read_csv(fname, sort_columns=True) nat\\sex F M BE 1 0 FO 3 2 - >>> fname = 'no_axis_name.csv' + + Read array saved in "narrow" format (wide=False) + + >>> a.to_csv(fname, wide=False) + >>> with open(fname) as f: + ... 
print(f.read().strip()) + nat,sex,value + BE,M,0 + BE,F,1 + FO,M,2 + FO,F,3 + >>> read_csv(fname, wide=False) + nat\\sex M F + BE 0 1 + FO 2 3 + + Specify the number of axes of the output array (useful when the name of the last axis is implicit) + >>> a.to_csv(fname, dialect='classic') + >>> with open(fname) as f: + ... print(f.read().strip()) + nat,M,F + BE,0,1 + FO,2,3 >>> read_csv(fname, nb_axes=2) nat\\{1} M F BE 0 1 @@ -341,12 +414,7 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse kwargs['header'] = 1 kwargs['comment'] = '#' - if nb_axes is not None and index_col is not None: - raise ValueError("cannot specify both nb_axes and index_col") - elif nb_axes is not None: - index_col = list(range(nb_axes - 1)) - elif isinstance(index_col, int): - index_col = [index_col] + index_col = _get_index_col(nb_axes, index_col, wide) if headersep is not None: if index_col is None: @@ -369,7 +437,7 @@ def read_csv(filepath_or_buffer, nb_axes=None, index_col=None, sep=',', headerse df.index.names = combined_axes_names.split(headersep) raw = False - return df_aslarray(df, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value, raw=raw) + return df_aslarray(df, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value, raw=raw, wide=wide) def read_tsv(filepath_or_buffer, **kwargs): @@ -430,7 +498,7 @@ def read_hdf(filepath_or_buffer, key, fill_value=np.nan, na=np.nan, sort_rows=Fa @deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1) @deprecate_kwarg('sheetname', 'sheet') def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=np.nan, na=np.nan, - sort_rows=False, sort_columns=False, engine=None, **kwargs): + sort_rows=False, sort_columns=False, wide=True, engine=None, **kwargs): """ Reads excel file from sheet name and returns an LArray with the contents @@ -456,6 +524,10 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=np.na sort_columns : bool, optional Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting). Defaults to False. + wide : bool, optional + Whether or not to assume the array is stored in "wide" format. + If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column. + Defaults to True. engine : {'xlrd', 'xlwings'}, optional Engine to use to read the Excel file. If None (default), it will use 'xlwings' by default if the module is installed and relies on Pandas default reader otherwise. 
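In the hunk below, `read_excel` (like `read_csv` above) replaces its inline `nb_axes`/`index_col` validation with the private `_get_index_col` helper introduced earlier in this diff. A doctest-style sketch of the helper's behaviour, derived from its code above (these calls are illustrative only and are not part of the patch):

    >>> _get_index_col(nb_axes=3)              # wide format: the first nb_axes - 1 columns form the index
    [0, 1]
    >>> _get_index_col(index_col=0)            # a single int is normalized to a one-element list
    [0]
    >>> _get_index_col(nb_axes=3, wide=False)  # narrow format infers its index columns itself
    Traceback (most recent call last):
        ...
    ValueError: `nb_axes` or `index_col` argument cannot be used when `wide` argument is False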
@@ -471,12 +543,7 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=np.na if engine is None: engine = 'xlwings' if xw is not None else None - if nb_axes is not None and index_col is not None: - raise ValueError("cannot specify both nb_axes and index_col") - elif nb_axes is not None: - index_col = list(range(nb_axes - 1)) - elif isinstance(index_col, int): - index_col = [index_col] + index_col = _get_index_col(nb_axes, index_col, wide) if engine == 'xlwings': if kwargs: @@ -485,11 +552,11 @@ def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=np.na from larray.inout.excel import open_excel with open_excel(filepath) as wb: return wb[sheet].load(index_col=index_col, fill_value=fill_value, sort_rows=sort_rows, - sort_columns=sort_columns) + sort_columns=sort_columns, wide=wide) else: df = pd.read_excel(filepath, sheet, index_col=index_col, engine=engine, **kwargs) return df_aslarray(df, sort_rows=sort_rows, sort_columns=sort_columns, raw=index_col is None, - fill_value=fill_value) + fill_value=fill_value, wide=wide) @deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1) @@ -518,7 +585,7 @@ def read_sas(filepath, nb_axes=None, index_col=None, fill_value=np.nan, na=np.na @deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1) -def from_lists(data, nb_axes=None, index_col=None, fill_value=np.nan, sort_rows=False, sort_columns=False): +def from_lists(data, nb_axes=None, index_col=None, fill_value=np.nan, sort_rows=False, sort_columns=False, wide=True): """ initialize array from a list of lists (lines) @@ -541,6 +608,10 @@ def from_lists(data, nb_axes=None, index_col=None, fill_value=np.nan, sort_rows= sort_columns : bool, optional Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting). Defaults to False. + wide : bool, optional + Whether or not to assume the array is stored in "wide" format. + If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column. + Defaults to True. Returns ------- @@ -558,6 +629,9 @@ def from_lists(data, nb_axes=None, index_col=None, fill_value=np.nan, sort_rows= sex\\year 1991 1992 1993 M 0 1 2 F 3 4 5 + + Read array with missing values + `fill_value` argument + >>> from_lists([['sex', 'nat\\\\year', 1991, 1992, 1993], ... [ 'M', 'BE', 1, 0, 0], ... [ 'M', 'FO', 2, 0, 0], @@ -567,6 +641,19 @@ def from_lists(data, nb_axes=None, index_col=None, fill_value=np.nan, sort_rows= M FO 2.0 0.0 0.0 F BE 0.0 0.0 1.0 F FO nan nan nan + + >>> from_lists([['sex', 'nat\\\\year', 1991, 1992, 1993], + ... [ 'M', 'BE', 1, 0, 0], + ... [ 'M', 'FO', 2, 0, 0], + ... [ 'F', 'BE', 0, 0, 1]], fill_value=42) + sex nat\\year 1991 1992 1993 + M BE 1 0 0 + M FO 2 0 0 + F BE 0 0 1 + F FO 42 42 42 + + Specify the number of axes of the array to be read + >>> from_lists([['sex', 'nat', 1991, 1992, 1993], ... [ 'M', 'BE', 1, 0, 0], ... [ 'M', 'FO', 2, 0, 0], @@ -576,33 +663,37 @@ def from_lists(data, nb_axes=None, index_col=None, fill_value=np.nan, sort_rows= M FO 2.0 0.0 0.0 F BE 0.0 0.0 1.0 F FO nan nan nan - >>> from_lists([['sex', 'nat\\\\year', 1991, 1992, 1993], - ... [ 'M', 'BE', 1, 0, 0], - ... [ 'M', 'FO', 2, 0, 0], - ... [ 'F', 'BE', 0, 0, 1]], fill_value=42) + + Read array saved in "narrow" format (wide=False) + + >>> from_lists([['sex', 'nat', 'year', 'value'], + ... [ 'M', 'BE', 1991, 1 ], + ... [ 'M', 'BE', 1992, 0 ], + ... [ 'M', 'BE', 1993, 0 ], + ... [ 'M', 'FO', 1991, 2 ], + ... [ 'M', 'FO', 1992, 0 ], + ... 
[ 'M', 'FO', 1993, 0 ], + ... [ 'F', 'BE', 1991, 0 ], + ... [ 'F', 'BE', 1992, 0 ], + ... [ 'F', 'BE', 1993, 1 ]], wide=False) sex nat\\year 1991 1992 1993 - M BE 1 0 0 - M FO 2 0 0 - F BE 0 0 1 - F FO 42 42 42 + M BE 1.0 0.0 0.0 + M FO 2.0 0.0 0.0 + F BE 0.0 0.0 1.0 + F FO nan nan nan """ - if nb_axes is not None and index_col is not None: - raise ValueError("cannot specify both nb_axes and index_col") - elif nb_axes is not None: - index_col = list(range(nb_axes - 1)) - elif isinstance(index_col, int): - index_col = [index_col] + index_col = _get_index_col(nb_axes, index_col, wide) df = pd.DataFrame(data[1:], columns=data[0]) if index_col is not None: df.set_index([df.columns[c] for c in index_col], inplace=True) return df_aslarray(df, raw=index_col is None, parse_header=False, sort_rows=sort_rows, sort_columns=sort_columns, - fill_value=fill_value) + fill_value=fill_value, wide=wide) @deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1) -def from_string(s, nb_axes=None, index_col=None, sep=' ', **kwargs): +def from_string(s, nb_axes=None, index_col=None, sep=' ', wide=True, **kwargs): """Create an array from a multi-line string. Parameters @@ -618,6 +709,10 @@ def from_string(s, nb_axes=None, index_col=None, sep=' ', **kwargs): Positions of columns for the n-1 first axes (ex. [0, 1, 2, 3]). Defaults to None (see nb_axes above). sep : str delimiter used to split each line into cells. + wide : bool, optional + Whether or not to assume the array is stored in "wide" format. + If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column. + Defaults to True. \**kwargs See arguments of Pandas read_csv function. @@ -671,4 +766,5 @@ def from_string(s, nb_axes=None, index_col=None, sep=' ', **kwargs): BE 0 1 FO 2 3 """ - return read_csv(StringIO(s), nb_axes=nb_axes, index_col=index_col, sep=sep, skipinitialspace=True, **kwargs) + return read_csv(StringIO(s), nb_axes=nb_axes, index_col=index_col, sep=sep, skipinitialspace=True, + wide=wide, **kwargs) diff --git a/larray/inout/excel.py b/larray/inout/excel.py index 63418b8f6..7c69f4205 100644 --- a/larray/inout/excel.py +++ b/larray/inout/excel.py @@ -418,9 +418,9 @@ def __setattr__(self, key, value): setattr(self.xw_sheet, key, value) def load(self, header=True, convert_float=True, nb_index=None, index_col=None, fill_value=np.nan, - sort_rows=False, sort_columns=False): + sort_rows=False, sort_columns=False, wide=True): return self[:].load(header=header, convert_float=convert_float, nb_index=nb_index, index_col=index_col, - sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value) + fill_value=fill_value, sort_rows=sort_rows, sort_columns=sort_columns, wide=wide) # TODO: generalize to more than 2 dimensions or scrap it def array(self, data, row_labels=None, column_labels=None, names=None): @@ -547,7 +547,7 @@ def __str__(self): __repr__ = __str__ def load(self, header=True, convert_float=True, nb_index=None, index_col=None, fill_value=np.nan, - sort_rows=False, sort_columns=False): + sort_rows=False, sort_columns=False, wide=True): if not self.ndim: return LArray([]) @@ -555,7 +555,7 @@ def load(self, header=True, convert_float=True, nb_index=None, index_col=None, f if header: return from_lists(list_data, nb_index=nb_index, index_col=index_col, fill_value=fill_value, - sort_rows=sort_rows, sort_columns=sort_columns) + sort_rows=sort_rows, sort_columns=sort_columns, wide=wide) else: return LArray(list_data) diff --git a/larray/tests/data/test.xlsx 
b/larray/tests/data/test.xlsx index 940c0700d..8a957d23c 100644 Binary files a/larray/tests/data/test.xlsx and b/larray/tests/data/test.xlsx differ diff --git a/larray/tests/data/test1d.csv b/larray/tests/data/test1d.csv index b7f741243..16be91619 100644 --- a/larray/tests/data/test1d.csv +++ b/larray/tests/data/test1d.csv @@ -1,2 +1,2 @@ -time,2007,2010,2013 +a,a0,a1,a2 ,0,1,2 diff --git a/larray/tests/data/test1d_narrow.csv b/larray/tests/data/test1d_narrow.csv new file mode 100644 index 000000000..eff558a1e --- /dev/null +++ b/larray/tests/data/test1d_narrow.csv @@ -0,0 +1,4 @@ +a,value +a0,0 +a1,1 +a2,2 diff --git a/larray/tests/data/test2d.csv b/larray/tests/data/test2d.csv index 99fb4a754..cd6ae8d37 100644 --- a/larray/tests/data/test2d.csv +++ b/larray/tests/data/test2d.csv @@ -1,3 +1,4 @@ -a\b,0,1,2 -0,0,1,2 -1,3,4,5 +a\b,b0,b1 +1,0,1 +2,2,3 +3,4,5 diff --git a/larray/tests/data/test2d_classic.csv b/larray/tests/data/test2d_classic.csv index a007c19a8..a5b63f948 100644 --- a/larray/tests/data/test2d_classic.csv +++ b/larray/tests/data/test2d_classic.csv @@ -1,6 +1,4 @@ -age,2007,2010,2013 -0,3722,3395,3347 -1,338,316,323 -2,2878,2791,2822 -3,1121,1037,976 -4,4073,4161,4429 \ No newline at end of file +a,b0,b1,b2 +a0,0,1,2 +a1,3,4,5 +a2,6,7,8 \ No newline at end of file diff --git a/larray/tests/data/test2d_narrow.csv b/larray/tests/data/test2d_narrow.csv new file mode 100644 index 000000000..2f726083e --- /dev/null +++ b/larray/tests/data/test2d_narrow.csv @@ -0,0 +1,7 @@ +a,b,value +1,b0,0 +1,b1,1 +2,b0,2 +2,b1,3 +3,b0,4 +3,b1,5 diff --git a/larray/tests/data/test3d.csv b/larray/tests/data/test3d.csv index 993846f6d..7816f5a64 100644 --- a/larray/tests/data/test3d.csv +++ b/larray/tests/data/test3d.csv @@ -1,9 +1,7 @@ -age,sex\time,2015,2016,2017 -0,F,0.5,1.5,2.5 -0,M,3.5,4.5,5.5 -1,F,6.5,7.5,8.5 -1,M,9.5,10.5,11.5 -2,F,12.5,13.5,14.5 -2,M,15.5,16.5,17.5 -3,F,18.5,19.5,20.5 -3,M,21.5,22.5,23.5 +a,b\c,c0,c1,c2 +1,b0,0,1,2 +1,b1,3,4,5 +2,b0,6,7,8 +2,b1,9,10,11 +3,b0,12,13,14 +3,b1,15,16,17 diff --git a/larray/tests/data/test3d_narrow.csv b/larray/tests/data/test3d_narrow.csv new file mode 100644 index 000000000..9e3c75014 --- /dev/null +++ b/larray/tests/data/test3d_narrow.csv @@ -0,0 +1,19 @@ +a,b,c,value +1,b0,c0,0 +1,b0,c1,1 +1,b0,c2,2 +1,b1,c0,3 +1,b1,c1,4 +1,b1,c2,5 +2,b0,c0,6 +2,b0,c1,7 +2,b0,c2,8 +2,b1,c0,9 +2,b1,c1,10 +2,b1,c2,11 +3,b0,c0,12 +3,b0,c1,13 +3,b0,c2,14 +3,b1,c0,15 +3,b1,c1,16 +3,b1,c2,17 diff --git a/larray/tests/data/test5d.csv b/larray/tests/data/test5d.csv deleted file mode 100644 index 98777ae04..000000000 --- a/larray/tests/data/test5d.csv +++ /dev/null @@ -1,41 +0,0 @@ -arr,age,sex,nat\time,2007,2010,2013 -1,0,F,1,3722,3395,3347 -1,0,F,2,338,316,323 -1,0,H,1,2878,2791,2822 -1,0,H,2,1121,1037,976 -1,1,F,1,4073,4161,4429 -1,1,F,2,1561,1463,1467 -1,1,H,1,3507,3741,3366 -1,1,H,2,2052,2052,2118 -1,2,F,1,4807,4868,4852 -1,2,F,2,3785,3508,3172 -1,2,H,1,4464,4692,4839 -1,2,H,2,95,100,103 -1,3,F,1,1135,1023,928 -1,3,F,2,3121,3180,2900 -1,3,H,1,3850,4118,3716 -1,3,H,2,4174,4131,4290 -1,4,F,1,4072,3775,3781 -1,4,F,2,4392,4305,4215 -1,4,H,1,1657,1492,1360 -1,4,H,2,916,973,895 -2,0,F,1,141,135,141 -2,0,F,2,3713,3952,3895 -2,0,H,1,1616,1499,1556 -2,0,H,2,1895,2029,1946 -2,1,F,1,3272,3501,3584 -2,1,F,2,793,725,679 -2,1,H,1,4766,5007,5242 -2,1,H,2,2943,2670,2525 -2,2,F,1,3594,3922,4019 -2,2,F,2,4150,3813,3576 -2,2,H,1,1264,1244,1333 -2,2,H,2,1922,1970,1972 -2,3,F,1,380,400,387 -2,3,F,2,3860,3787,3431 -2,3,H,1,2962,3112,2962 -2,3,H,2,2679,2599,2490 -2,4,F,1,4159,4210,4461 
-2,4,F,2,28,27,28 -2,4,H,1,2887,2972,2837 -2,4,H,2,2712,2827,2806 diff --git a/larray/tests/data/test_narrow.xlsx b/larray/tests/data/test_narrow.xlsx new file mode 100644 index 000000000..010c47986 Binary files /dev/null and b/larray/tests/data/test_narrow.xlsx differ diff --git a/larray/tests/data/testint_labels.csv b/larray/tests/data/testint_labels.csv new file mode 100644 index 000000000..56d877e39 --- /dev/null +++ b/larray/tests/data/testint_labels.csv @@ -0,0 +1,10 @@ +a,b\c,0,1,2 +0,0,0,1,2 +0,1,3,4,5 +0,2,6,7,8 +1,0,9,10,11 +1,1,12,13,14 +1,2,15,16,17 +2,0,18,19,20 +2,1,21,22,23 +2,2,24,25,26 diff --git a/larray/tests/data/testmissing_values.csv b/larray/tests/data/testmissing_values.csv new file mode 100644 index 000000000..f05c14ea9 --- /dev/null +++ b/larray/tests/data/testmissing_values.csv @@ -0,0 +1,5 @@ +a,b\c,c0,c1,c2 +1,b0,0,1,2 +1,b1,3,4,5 +2,b1,9,10,11 +3,b0,12,13,14 diff --git a/larray/tests/data/testmissing_values_narrow.csv b/larray/tests/data/testmissing_values_narrow.csv new file mode 100644 index 000000000..1663c2a6a --- /dev/null +++ b/larray/tests/data/testmissing_values_narrow.csv @@ -0,0 +1,12 @@ +a,b,c,value +1,b0,c0,0 +1,b0,c1,1 +1,b0,c2,2 +1,b1,c0,3 +1,b1,c1,4 +1,b1,c2,5 +2,b1,c0,9 +2,b1,c2,11 +3,b0,c0,12 +3,b0,c1,13 +3,b0,c2,14 diff --git a/larray/tests/data/testunsorted_narrow.csv b/larray/tests/data/testunsorted_narrow.csv new file mode 100644 index 000000000..295e82af8 --- /dev/null +++ b/larray/tests/data/testunsorted_narrow.csv @@ -0,0 +1,19 @@ +a,b,c,value +3,b1,c2,0 +3,b1,c1,1 +3,b1,c0,2 +3,b0,c2,3 +3,b0,c1,4 +3,b0,c0,5 +2,b1,c2,6 +2,b1,c1,7 +2,b1,c0,8 +2,b0,c2,9 +2,b0,c1,10 +2,b0,c0,11 +1,b1,c2,12 +1,b1,c1,13 +1,b1,c0,14 +1,b0,c2,15 +1,b0,c1,16 +1,b0,c0,17 diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index c0199d9b8..985b9ffa3 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -88,6 +88,17 @@ def setUp(self): self.small_data = np.arange(30).reshape(2, 15) self.small = LArray(self.small_data, axes=(self.sex, self.lipro), title=self.small_title) + self.io_1d = ndtest(3) + self.io_2d = ndtest("a=1..3; b=b0,b1") + self.io_3d = ndtest("a=1..3; b=b0,b1; c=c0..c2") + self.io_int_labels = ndtest("a=0..2; b=0..2; c=0..2") + self.io_unsorted = ndtest("a=3..1; b=b1,b0; c=c2..c0") + self.io_missing_values = ndtest("a=1..3; b=b0,b1; c=c0..c2", dtype=float) + self.io_missing_values[2, 'b0'] = np.nan + self.io_missing_values[3, 'b1'] = np.nan + self.io_narrow_missing_values = self.io_missing_values.copy() + self.io_narrow_missing_values[2, 'b1', 'c1'] = np.nan + @pytest.fixture(autouse=True) def setup(self, tmpdir): self.tmpdir = tmpdir.strpath @@ -2590,27 +2601,19 @@ def test_from_string(self): def test_read_csv(self): res = read_csv(inputpath('test1d.csv')) - assert_array_equal(res, ndtest('time=2007,2010,2013')) + assert_array_equal(res, self.io_1d) res = read_csv(inputpath('test2d.csv')) - assert_array_equal(res, ndtest('a=0,1;b=0,1,2')) + assert_array_equal(res, self.io_2d) res = read_csv(inputpath('test3d.csv')) - expected = ndtest('age=0..3;sex=F,M;time=2015..2017') + 0.5 - assert_array_equal(res, expected) + assert_array_equal(res, self.io_3d) - la = read_csv(inputpath('test5d.csv')) - self.assertEqual(la.ndim, 5) - self.assertEqual(la.shape, (2, 5, 2, 2, 3)) - self.assertEqual(la.axes.names, ['arr', 'age', 'sex', 'nat', 'time']) - assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], - [3722, 3395, 3347]) + res = read_csv(inputpath('testint_labels.csv')) + assert_array_equal(res, self.io_int_labels) - la 
= read_csv(inputpath('test2d_classic.csv')) - self.assertEqual(la.ndim, 2) - self.assertEqual(la.shape, (5, 3)) - self.assertEqual(la.axes.names, ['age', None]) - assert_array_equal(la[0, :], [3722, 3395, 3347]) + res = read_csv(inputpath('test2d_classic.csv')) + assert_array_equal(res, ndtest("a=a0..a2; b0..b2")) la = read_csv(inputpath('test1d_liam2.csv'), dialect='liam2') self.assertEqual(la.ndim, 1) @@ -2622,8 +2625,11 @@ def test_read_csv(self): self.assertEqual(la.ndim, 5) self.assertEqual(la.shape, (2, 5, 2, 2, 3)) self.assertEqual(la.axes.names, ['arr', 'age', 'sex', 'nat', 'time']) - assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], - [3722, 3395, 3347]) + assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], [3722, 3395, 3347]) + + # missing values + res = read_csv(inputpath('testmissing_values.csv')) + assert_array_nan_equal(res, self.io_missing_values) # test StringIO res = read_csv(StringIO('a,1,2\n,0,1\n')) @@ -2633,6 +2639,26 @@ def test_read_csv(self): res = read_csv(StringIO('a,a2,a0,a1\n,2,0,1\n'), sort_columns=True) assert_array_equal(res, ndtest(3)) + ################# + # narrow format # + ################# + res = read_csv(inputpath('test1d_narrow.csv'), wide=False) + assert_array_equal(res, self.io_1d) + + res = read_csv(inputpath('test2d_narrow.csv'), wide=False) + assert_array_equal(res, self.io_2d) + + res = read_csv(inputpath('test3d_narrow.csv'), wide=False) + assert_array_equal(res, self.io_3d) + + # missing values + res = read_csv(inputpath('testmissing_values_narrow.csv'), wide=False) + assert_array_nan_equal(res, self.io_narrow_missing_values) + + # unsorted values + res = read_csv(inputpath('testunsorted_narrow.csv'), wide=False) + assert_array_equal(res, self.io_unsorted) + def test_read_eurostat(self): la = read_eurostat(inputpath('test5d_eurostat.csv')) self.assertEqual(la.ndim, 5) @@ -2644,59 +2670,67 @@ def test_read_eurostat(self): @pytest.mark.skipif(xw is None, reason="xlwings is not available") def test_read_excel_xlwings(self): - la = read_excel(inputpath('test.xlsx'), '1d') - self.assertEqual(la.ndim, 1) - self.assertEqual(la.shape, (3,)) - self.assertEqual(la.axes.names, ['time']) - assert_array_equal(la, [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '1d') + assert_array_equal(arr, self.io_1d) - la = read_excel(inputpath('test.xlsx'), '2d') - self.assertEqual(la.ndim, 2) - self.assertEqual(la.shape, (5, 3)) - self.assertEqual(la.axes.names, ['age', 'time']) - assert_array_equal(la[0, :], [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '2d') + assert_array_equal(arr, self.io_2d) - la = read_excel(inputpath('test.xlsx'), '3d') - self.assertEqual(la.ndim, 3) - self.assertEqual(la.shape, (5, 2, 3)) - self.assertEqual(la.axes.names, ['age', 'sex', 'time']) - assert_array_equal(la[0, 'F', :], [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '3d') + assert_array_equal(arr, self.io_3d) - la = read_excel(inputpath('test.xlsx'), '5d') - self.assertEqual(la.ndim, 5) - self.assertEqual(la.shape, (2, 5, 2, 2, 3)) - self.assertEqual(la.axes.names, ['arr', 'age', 'sex', 'nat', 'time']) - assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], - [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), 'int_labels') + assert_array_equal(arr, self.io_int_labels) - la = read_excel(inputpath('test.xlsx'), '2d_classic') - self.assertEqual(la.ndim, 2) - self.assertEqual(la.shape, (5, 3)) - self.assertEqual(la.axes.names, ['age', None]) - assert_array_equal(la[0, :], [3722, 3395, 3347]) + arr = 
read_excel(inputpath('test.xlsx'), '2d_classic') + assert_array_equal(arr, ndtest("a=a0..a2; b0..b2")) # passing a Group as sheet arg axis = Axis('dim=1d,2d,3d,5d') - la = read_excel(inputpath('test.xlsx'), axis['1d']) - self.assertEqual(la.ndim, 1) - self.assertEqual(la.shape, (3,)) - self.assertEqual(la.axes.names, ['time']) - assert_array_equal(la, [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), axis['1d']) + assert_array_equal(arr, self.io_1d) - # fill_value argument - la = read_excel(inputpath('test.xlsx'), 'missing_values', fill_value=42) - assert la.ndim == 3 - assert la.shape == (5, 2, 3) - assert la.axes.names == ['age', 'sex', 'time'] - assert_array_equal(la[1, 'H', :], [42, 42, 42]) - assert_array_equal(la[4, 'F', :], [42, 42, 42]) + # missing rows + fill_value argument + arr = read_excel(inputpath('test.xlsx'), 'missing_values', fill_value=42) + expected = self.io_missing_values.copy() + expected[isnan(expected)] = 42 + assert_array_equal(arr, expected) + + ################# + # narrow format # + ################# + arr = read_excel(inputpath('test_narrow.xlsx'), '1d', wide=False) + assert_array_equal(arr, self.io_1d) + + arr = read_excel(inputpath('test_narrow.xlsx'), '2d', wide=False) + assert_array_equal(arr, self.io_2d) + + arr = read_excel(inputpath('test_narrow.xlsx'), '3d', wide=False) + assert_array_equal(arr, self.io_3d) + + # missing rows + fill_value argument + arr = read_excel(inputpath('test_narrow.xlsx'), 'missing_values', fill_value=42, wide=False) + expected = self.io_narrow_missing_values.copy() + expected[isnan(expected)] = 42 + assert_array_equal(arr, expected) + + # unsorted values + arr = read_excel(inputpath('test_narrow.xlsx'), 'unsorted', wide=False) + assert_array_equal(arr, self.io_unsorted) + + ############################## + # invalid keyword argument # + ############################## - # invalid keyword argument with self.assertRaisesRegexp(TypeError, "'dtype' is an invalid keyword argument for this function when using " "the xlwings backend"): read_excel(inputpath('test.xlsx'), engine='xlwings', dtype=float) + ################# + # blank cells # + ################# + # Excel sheet with blank cells on right/bottom border of the array to read fpath = inputpath('test_blank_cells.xlsx') good = read_excel(fpath, 'good') @@ -2714,64 +2748,65 @@ def test_read_excel_xlwings(self): assert_array_equal(bad4, good2) def test_read_excel_pandas(self): - la = read_excel(inputpath('test.xlsx'), '1d', engine='xlrd') - self.assertEqual(la.ndim, 1) - self.assertEqual(la.shape, (3,)) - self.assertEqual(la.axes.names, ['time']) - assert_array_equal(la, [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '1d', engine='xlrd') + assert_array_equal(arr, self.io_1d) - la = read_excel(inputpath('test.xlsx'), '2d', nb_axes=2, engine='xlrd') - self.assertEqual(la.ndim, 2) - self.assertEqual(la.shape, (5, 3)) - self.assertEqual(la.axes.names, ['age', 'time']) - assert_array_equal(la[0, :], [3722, 3395, 3347]) - - la = read_excel(inputpath('test.xlsx'), '2d', engine='xlrd') - self.assertEqual(la.ndim, 2) - self.assertEqual(la.shape, (5, 3)) - self.assertEqual(la.axes.names, ['age', 'time']) - assert_array_equal(la[0, :], [3722, 3395, 3347]) - - la = read_excel(inputpath('test.xlsx'), '3d', index_col=[0, 1], engine='xlrd') - self.assertEqual(la.ndim, 3) - self.assertEqual(la.shape, (5, 2, 3)) - self.assertEqual(la.axes.names, ['age', 'sex', 'time']) - assert_array_equal(la[0, 'F', :], [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '2d', 
nb_axes=2, engine='xlrd') + assert_array_equal(arr, self.io_2d) - la = read_excel(inputpath('test.xlsx'), '3d', engine='xlrd') - self.assertEqual(la.ndim, 3) - self.assertEqual(la.shape, (5, 2, 3)) - self.assertEqual(la.axes.names, ['age', 'sex', 'time']) - assert_array_equal(la[0, 'F', :], [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '2d', engine='xlrd') + assert_array_equal(arr, self.io_2d) - la = read_excel(inputpath('test.xlsx'), '5d', nb_axes=5, engine='xlrd') - self.assertEqual(la.ndim, 5) - self.assertEqual(la.shape, (2, 5, 2, 2, 3)) - self.assertEqual(la.axes.names, ['arr', 'age', 'sex', 'nat', 'time']) - assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], - [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '3d', index_col=[0, 1], engine='xlrd') + assert_array_equal(arr, self.io_3d) - la = read_excel(inputpath('test.xlsx'), '5d', engine='xlrd') - self.assertEqual(la.ndim, 5) - self.assertEqual(la.shape, (2, 5, 2, 2, 3)) - self.assertEqual(la.axes.names, ['arr', 'age', 'sex', 'nat', 'time']) - assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], - [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), '3d', engine='xlrd') + assert_array_equal(arr, self.io_3d) - la = read_excel(inputpath('test.xlsx'), '2d_classic', engine='xlrd') - self.assertEqual(la.ndim, 2) - self.assertEqual(la.shape, (5, 3)) - self.assertEqual(la.axes.names, ['age', None]) - assert_array_equal(la[0, :], [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), 'int_labels', engine='xlrd') + assert_array_equal(arr, self.io_int_labels) + + arr = read_excel(inputpath('test.xlsx'), '2d_classic', engine='xlrd') + assert_array_equal(arr, ndtest("a=a0..a2; b0..b2")) # passing a Group as sheet arg axis = Axis('dim=1d,2d,3d,5d') - la = read_excel(inputpath('test.xlsx'), axis['1d'], engine='xlrd') - self.assertEqual(la.ndim, 1) - self.assertEqual(la.shape, (3,)) - self.assertEqual(la.axes.names, ['time']) - assert_array_equal(la, [3722, 3395, 3347]) + arr = read_excel(inputpath('test.xlsx'), axis['1d'], engine='xlrd') + assert_array_equal(arr, self.io_1d) + + # missing rows + fill_value argument + arr = read_excel(inputpath('test.xlsx'), 'missing_values', fill_value=42, engine='xlrd') + expected = self.io_missing_values.copy() + expected[isnan(expected)] = 42 + assert_array_equal(arr, expected) + + ################# + # narrow format # + ################# + arr = read_excel(inputpath('test_narrow.xlsx'), '1d', wide=False, engine='xlrd') + assert_array_equal(arr, self.io_1d) + + arr = read_excel(inputpath('test_narrow.xlsx'), '2d', wide=False, engine='xlrd') + assert_array_equal(arr, self.io_2d) + + arr = read_excel(inputpath('test_narrow.xlsx'), '3d', wide=False, engine='xlrd') + assert_array_equal(arr, self.io_3d) + + # missing rows + fill_value argument + arr = read_excel(inputpath('test_narrow.xlsx'), 'missing_values', + fill_value=42, wide=False, engine='xlrd') + expected = self.io_narrow_missing_values + expected[isnan(expected)] = 42 + assert_array_equal(arr, expected) + + # unsorted values + arr = read_excel(inputpath('test_narrow.xlsx'), 'unsorted', wide=False, engine='xlrd') + assert_array_equal(arr, self.io_unsorted) + + ################# + # blank cells # + ################# # Excel sheet with blank cells on right/bottom border of the array to read fpath = inputpath('test_blank_cells.xlsx') @@ -3172,31 +3207,26 @@ def test_from_frame(self): assert_array_equal(la[0, 'F', :], [3722, 3395, 3347]) def test_to_csv(self): - la = read_csv(inputpath('test5d.csv')) - 
self.assertEqual(la.ndim, 5) - self.assertEqual(la.shape, (2, 5, 2, 2, 3)) - self.assertEqual(la.axes.names, ['arr', 'age', 'sex', 'nat', 'time']) - assert_array_equal(la[X.arr[1], 0, 'F', X.nat[1], :], - [3722, 3395, 3347]) + arr = self.io_3d.copy() - la.to_csv(self.tmp_path('out.csv')) - result = ['arr,age,sex,nat\\time,2007,2010,2013\n', - '1,0,F,1,3722,3395,3347\n', - '1,0,F,2,338,316,323\n'] + arr.to_csv(self.tmp_path('out.csv')) + result = ['a,b\\c,c0,c1,c2\n', + '1,b0,0,1,2\n', + '1,b1,3,4,5\n'] with open(self.tmp_path('out.csv')) as f: self.assertEqual(f.readlines()[:3], result) # stacked data (one column containing all the values and another column listing the context of the value) - la.to_csv(self.tmp_path('out.csv'), wide=False) - result = ['arr,age,sex,nat,time,value\n', - '1,0,F,1,2007,3722\n', - '1,0,F,1,2010,3395\n'] + arr.to_csv(self.tmp_path('out.csv'), wide=False) + result = ['a,b,c,value\n', + '1,b0,c0,0\n', + '1,b0,c1,1\n'] with open(self.tmp_path('out.csv')) as f: self.assertEqual(f.readlines()[:3], result) - la = ndtest([Axis('time=2015..2017')]) - la.to_csv(self.tmp_path('test_out1d.csv')) - result = ['time,2015,2016,2017\n', + arr = self.io_1d.copy() + arr.to_csv(self.tmp_path('test_out1d.csv')) + result = ['a,a0,a1,a2\n', ',0,1,2\n'] with open(self.tmp_path('test_out1d.csv')) as f: self.assertEqual(f.readlines(), result) @@ -3521,11 +3551,9 @@ def test_open_excel(self): assert_array_equal(res.axes[2].labels, a3.axes[2].labels) with open_excel(inputpath('test.xlsx')) as wb: + expected = ndtest("a=a0..a2; b0..b2") res = wb['2d_classic'].load() - assert res.ndim == 2 - assert res.shape == (5, 3) - assert res.axes.names == ['age', None] - assert_array_equal(res[0, :], [3722, 3395, 3347]) + assert_array_equal(res, expected) # 3) without headers # ==================
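The narrow-format branch added to `df_aslarray` earlier in this diff is, at its core, a pandas set_index/unstack/reindex sequence. A minimal standalone sketch of that mechanism, using a small hypothetical DataFrame rather than larray's own readers (exact repr spacing may differ between pandas versions):

    >>> import numpy as np
    >>> import pandas as pd
    >>> df = pd.DataFrame([['a0', 'b0', 0], ['a0', 'b1', 1],
    ...                    ['a1', 'b0', 2], ['a1', 'b1', 3]],
    ...                   columns=['a', 'b', 'value'])
    >>> df = df.set_index(['a', 'b'])                  # every column except the last becomes the index
    >>> series = df[df.columns[-1]]                    # the remaining column holds the values
    >>> series.unstack(level=-1, fill_value=np.nan)    # the last level is pivoted to the columns
    b   b0  b1
    a
    a0   0   1
    a1   2   3

Because pandas `unstack` always returns sorted row and column labels, `df_aslarray` then reindexes the result with the labels taken in order of first appearance (via `df_labels(series, sort=False)` and `MultiIndex.from_tuples`), which is what keeps files such as `testunsorted_narrow.csv` from coming back sorted when `sort_rows`/`sort_columns` are left at their default of False.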