From 36d9fcb0f2ddf3d7735bfb3b45ff09ce46ce1319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 30 Jan 2015 14:30:29 +0100 Subject: [PATCH 1/2] progress on reading hdf via pandas but do not expand them --- larray/core.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/larray/core.py b/larray/core.py index 6e240f903..c85716ef6 100644 --- a/larray/core.py +++ b/larray/core.py @@ -1648,6 +1648,37 @@ def df_aslarray(df, sort_rows=True, sort_columns=True, **kwargs): return LArray(data, axes) +class DataFrameWrapper(object): + def __init__(self, df): + self.df = df + + def __getitem__(self, key): + return self.df[key] + + def __getattr__(self, key): + return getattr(self.df, key) + + +#TODO: implement sort_columns +def df_aslarray2(df, sort_rows=True, sort_columns=True, **kwargs): + axes_names = [decode(name, 'utf8') for name in df.index.names] + if axes_names == [None]: + last_axis = None, None + else: + last_axis = axes_names[-1].split('\\') + axes_names[-1] = last_axis[0] + #FIXME: hardcoded "time" + axes_names.append(last_axis[1] if len(last_axis) > 1 else 'time') + + axes_labels = df_labels(df, sort=sort_rows) + # pandas treats the "time" labels as column names (strings) so we need + # to convert them to values + axes_labels.append([parse(cell) for cell in df.columns.values]) + + axes = [Axis(name, labels) for name, labels in zip(axes_names, axes_labels)] + return LArray(DataFrameWrapper(df), axes) + + def read_csv(filepath, nb_index=0, index_col=[], sep=',', headersep=None, na=np.nan, sort_rows=True, sort_columns=True, **kwargs): """ @@ -1733,14 +1764,12 @@ def read_eurostat(filepath, **kwargs): return read_csv(filepath, sep='\t', headersep=',', **kwargs) -def read_hdf(filepath, key, na=np.nan, sort_rows=True, sort_columns=True, - **kwargs): +def read_hdf(filepath, key, sort_rows=True, sort_columns=True, **kwargs): """ read an LArray from a h5 file with the specified name """ df = pd.read_hdf(filepath, key, **kwargs) - return df_aslarray(df, sort_rows=sort_rows, sort_columns=sort_columns, - fill_value=na) + return df_aslarray2(df, sort_rows=sort_rows, sort_columns=sort_columns) def read_excel(filepath, sheetname=0, nb_index=0, index_col=[], From fa73cbb72f3569eedc60d5b20aa492e6a6c3e1e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20de=20Menten?= Date: Fri, 30 Jan 2015 16:56:56 +0100 Subject: [PATCH 2/2] DataFrameWrapper masquerades a dataframe as an ndarray just well enough that we can run our test suite not very interesting yet because the DF is still assumed to be dense and we pass it through asarray before indexing it --- larray/core.py | 40 +++++++++++++++++++++++++++++++++++++--- larray/tests/test_la.py | 20 ++++++++++++++++---- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/larray/core.py b/larray/core.py index c85716ef6..b4393c9c9 100644 --- a/larray/core.py +++ b/larray/core.py @@ -1365,7 +1365,7 @@ def opmethod(self, other): elif not np.isscalar(other): raise TypeError("unsupported operand type(s) for %s: '%s' " "and '%s'" % (opname, type(self), type(other))) - return LArray(super_method(self.data, other), self.axes) + return LArray(super_method(np.asarray(self), other), self.axes) opmethod.__name__ = fullname return opmethod @@ -1411,7 +1411,7 @@ def _unaryop(opname): super_method = getattr(np.ndarray, fullname) def opmethod(self): - return LArray(super_method(self.data), self.axes) + return LArray(super_method(np.asarray(self)), self.axes) opmethod.__name__ = fullname return opmethod @@ -1541,6 +1541,15 @@ def to_clipboard(self, *args, **kwargs): def plot(self, *args, **kwargs): self.df.plot(*args, **kwargs) + #XXX: one less indirection as we have all the info at this level? + # @property + # def shape(self): + # return tuple(len(a) for a in self.axes) + # + # @property + # def ndim(self): + # return len(self.axes) + @property def shape(self): return self.data.shape @@ -1565,7 +1574,7 @@ def __len__(self): return len(self.data) def __array__(self, dtype=None): - return self.data + return np.asarray(self.data) __array_priority__ = 100 @@ -1658,6 +1667,31 @@ def __getitem__(self, key): def __getattr__(self, key): return getattr(self.df, key) + @property + def dtype(self): + # assumes df is homogeneous ! + return self.df.dtypes[0] + + @property + def ndim(self): + return self.df.index.nlevels + 1 + + @property + def shape(self): + shape = [len(level) for level in self.df.index.levels] + shape.append(len(self.df.columns)) + return tuple(shape) + + def copy(self): + return DataFrameWrapper(self.df.copy()) + + # not caught by __getattr__? + def __len__(self): + return self.shape[0] + + def __array__(self, dtype=None): + return self.df.__array__(dtype).reshape(self.shape) + #TODO: implement sort_columns def df_aslarray2(df, sort_rows=True, sort_columns=True, **kwargs): diff --git a/larray/tests/test_la.py b/larray/tests/test_la.py index c12a25105..32e8de8ba 100644 --- a/larray/tests/test_la.py +++ b/larray/tests/test_la.py @@ -10,7 +10,7 @@ import larray from larray import (LArray, Axis, ValueGroup, union, to_ticks, to_key, srange, larray_equal, read_csv, read_hdf, df_aslarray, - zeros, zeros_like, AxisCollection) + zeros, zeros_like, AxisCollection, DataFrameWrapper) from larray.utils import array_equal, array_nan_equal @@ -509,11 +509,23 @@ def setUp(self): self.array = np.arange(116 * 44 * 2 * 15).reshape(116, 44, 2, 15) \ .astype(float) - self.larray = LArray(self.array, - axes=(self.age, self.geo, self.sex, self.lipro)) + idx = pd.MultiIndex.from_product([self.age.labels, self.geo.labels, + self.sex.labels]) + dfarray = self.array.reshape(116 * 44 * 2, 15) + df = pd.DataFrame(dfarray, idx, columns=self.lipro.labels) + wrapped = DataFrameWrapper(df) + self.larray = LArray(wrapped, (self.age, self.geo, self.sex, + self.lipro)) + # self.larray = LArray(self.array, + # axes=(self.age, self.geo, self.sex, self.lipro)) + # self.larray = read_hdf('c:/tmp/y.h5', 'y', sort_rows=False) self.small_data = np.arange(30).reshape(2, 15) - self.small = LArray(self.small_data, axes=(self.sex, self.lipro)) + df = pd.DataFrame(self.small_data, self.sex.labels, + columns=self.lipro.labels) + self.small = LArray(DataFrameWrapper(df), (self.sex, self.lipro)) + # self.small = LArray(self.small_data, axes=(self.sex, self.lipro)) + # self.small = read_hdf('c:/tmp/x.h5', 'x', sort_rows=False) def test_zeros(self): la = zeros((self.geo, self.age))