diff --git a/larray/core.py b/larray/core.py
index 6e240f903..b4393c9c9 100644
--- a/larray/core.py
+++ b/larray/core.py
@@ -1365,7 +1365,7 @@ def opmethod(self, other):
             elif not np.isscalar(other):
                 raise TypeError("unsupported operand type(s) for %s: '%s' "
                                 "and '%s'" % (opname, type(self), type(other)))
-            return LArray(super_method(self.data, other), self.axes)
+            return LArray(super_method(np.asarray(self), other), self.axes)
         opmethod.__name__ = fullname
         return opmethod
 
@@ -1411,7 +1411,7 @@ def _unaryop(opname):
         super_method = getattr(np.ndarray, fullname)
 
         def opmethod(self):
-            return LArray(super_method(self.data), self.axes)
+            return LArray(super_method(np.asarray(self)), self.axes)
         opmethod.__name__ = fullname
         return opmethod
 
@@ -1541,6 +1541,15 @@ def to_clipboard(self, *args, **kwargs):
     def plot(self, *args, **kwargs):
         self.df.plot(*args, **kwargs)
 
+    #XXX: one less indirection as we have all the info at this level?
+    # @property
+    # def shape(self):
+    #     return tuple(len(a) for a in self.axes)
+    #
+    # @property
+    # def ndim(self):
+    #     return len(self.axes)
+
     @property
     def shape(self):
         return self.data.shape
@@ -1565,7 +1574,7 @@ def __len__(self):
         return len(self.data)
 
     def __array__(self, dtype=None):
-        return self.data
+        return np.asarray(self.data)
 
     __array_priority__ = 100
 
@@ -1648,6 +1657,62 @@ def df_aslarray(df, sort_rows=True, sort_columns=True, **kwargs):
     return LArray(data, axes)
 
 
+class DataFrameWrapper(object):
+    def __init__(self, df):
+        self.df = df
+
+    def __getitem__(self, key):
+        return self.df[key]
+
+    def __getattr__(self, key):
+        return getattr(self.df, key)
+
+    @property
+    def dtype(self):
+        # assumes df is homogeneous !
+        return self.df.dtypes[0]
+
+    @property
+    def ndim(self):
+        return self.df.index.nlevels + 1
+
+    @property
+    def shape(self):
+        shape = [len(level) for level in self.df.index.levels]
+        shape.append(len(self.df.columns))
+        return tuple(shape)
+
+    def copy(self):
+        return DataFrameWrapper(self.df.copy())
+
+    # not caught by __getattr__?
+    def __len__(self):
+        return self.shape[0]
+
+    def __array__(self, dtype=None):
+        return self.df.__array__(dtype).reshape(self.shape)
+
+
+#TODO: implement sort_columns
+def df_aslarray2(df, sort_rows=True, sort_columns=True, **kwargs):
+    axes_names = [decode(name, 'utf8') for name in df.index.names]
+    if axes_names == [None]:
+        last_axis = None, None
+    else:
+        last_axis = axes_names[-1].split('\\')
+    axes_names[-1] = last_axis[0]
+    #FIXME: hardcoded "time"
+    axes_names.append(last_axis[1] if len(last_axis) > 1 else 'time')
+
+    axes_labels = df_labels(df, sort=sort_rows)
+    # pandas treats the "time" labels as column names (strings) so we need
+    # to convert them to values
+    axes_labels.append([parse(cell) for cell in df.columns.values])
+
+    axes = [Axis(name, labels) for name, labels in zip(axes_names, axes_labels)]
+    return LArray(DataFrameWrapper(df), axes)
+
+
 def read_csv(filepath, nb_index=0, index_col=[], sep=',', headersep=None,
              na=np.nan, sort_rows=True, sort_columns=True, **kwargs):
     """
@@ -1733,14 +1798,12 @@ def read_eurostat(filepath, **kwargs):
     return read_csv(filepath, sep='\t', headersep=',', **kwargs)
 
 
-def read_hdf(filepath, key, na=np.nan, sort_rows=True, sort_columns=True,
-             **kwargs):
+def read_hdf(filepath, key, sort_rows=True, sort_columns=True, **kwargs):
     """
     read an LArray from a h5 file with the specified name
     """
     df = pd.read_hdf(filepath, key, **kwargs)
-    return df_aslarray(df, sort_rows=sort_rows, sort_columns=sort_columns,
-                       fill_value=na)
+    return df_aslarray2(df, sort_rows=sort_rows, sort_columns=sort_columns)
 
 
 def read_excel(filepath, sheetname=0, nb_index=0, index_col=[],
diff --git a/larray/tests/test_la.py b/larray/tests/test_la.py
index c12a25105..32e8de8ba 100644
--- a/larray/tests/test_la.py
+++ b/larray/tests/test_la.py
@@ -10,7 +10,7 @@ import larray
 
 from larray import (LArray, Axis, ValueGroup, union, to_ticks, to_key,
                     srange, larray_equal, read_csv, read_hdf, df_aslarray,
-                    zeros, zeros_like, AxisCollection)
+                    zeros, zeros_like, AxisCollection, DataFrameWrapper)
 from larray.utils import array_equal, array_nan_equal
 
 
@@ -509,11 +509,23 @@ def setUp(self):
 
         self.array = np.arange(116 * 44 * 2 * 15).reshape(116, 44, 2, 15) \
                                                  .astype(float)
-        self.larray = LArray(self.array,
-                             axes=(self.age, self.geo, self.sex, self.lipro))
+        idx = pd.MultiIndex.from_product([self.age.labels, self.geo.labels,
+                                          self.sex.labels])
+        dfarray = self.array.reshape(116 * 44 * 2, 15)
+        df = pd.DataFrame(dfarray, idx, columns=self.lipro.labels)
+        wrapped = DataFrameWrapper(df)
+        self.larray = LArray(wrapped, (self.age, self.geo, self.sex,
+                                       self.lipro))
+        # self.larray = LArray(self.array,
+        #                      axes=(self.age, self.geo, self.sex, self.lipro))
+        # self.larray = read_hdf('c:/tmp/y.h5', 'y', sort_rows=False)
 
         self.small_data = np.arange(30).reshape(2, 15)
-        self.small = LArray(self.small_data, axes=(self.sex, self.lipro))
+        df = pd.DataFrame(self.small_data, self.sex.labels,
+                          columns=self.lipro.labels)
+        self.small = LArray(DataFrameWrapper(df), (self.sex, self.lipro))
+        # self.small = LArray(self.small_data, axes=(self.sex, self.lipro))
+        # self.small = read_hdf('c:/tmp/x.h5', 'x', sort_rows=False)
 
     def test_zeros(self):
         la = zeros((self.geo, self.age))