Skip to content

Commit f49732b

Browse files
committed
style fixes and TODO cleanup
1 parent 03c1184 commit f49732b

File tree

2 files changed

+82
-83
lines changed

2 files changed

+82
-83
lines changed

larray.py

+81-81
Original file line numberDiff line numberDiff line change
@@ -4,70 +4,66 @@
44
Matrix class
55
"""
66
#TODO
7-
# * implement new syntax
8-
7+
# * cleanup unit tests
98
# * easily add sum column for a dimension
10-
# a.sum(age=group_to_family(age[:]) + [':'])
11-
12-
# this should work (I think)
13-
# a.sum(age=[(l,) for l in age[:]] + slice(None))
14-
15-
# a.with_total(age=np.sum)
16-
# a.with_total(age=np.sum,np.avg) # potentially several totals
17-
# a.append(age=a.sum(age))
18-
# a.append(age='sum')
19-
# a.append(age=sum)
20-
# a.append(age='total=sum') # total = the name of the new label
21-
22-
# the following should work already (modulo the axis name -> axis num)
23-
# all_ages = a.sum(age=(':',))
24-
# np.concatenate((a, all_ages), axis=age)
25-
26-
# np.append(a, a.sum(age), axis=age)
27-
# a.append(a.sum(age), axis=age)
28-
29-
30-
# reorder an axis labels
31-
# modify read_csv format (last_column / time)
32-
# test to_csv: does it consume too much mem?
33-
# ---> test pandas (one dimension horizontally)
34-
# add labels in ValueGroups.__str__
35-
# xlsx export workbook without overwriting some sheets (charts)
36-
# implement x = bel.filter(age='0:10')
37-
# implement y = bel.sum(sex='H,F')
38-
39-
# XXX: allow naming "one-shot" groups? e.g:
40-
# regsum = bel.sum(lipro='P01,P02 = P01P02; : = all')
41-
42-
# XXX: __getitem__
43-
# * integer key on a non-integer label dimension is non-ambiguous:
44-
# - treat them like indices
45-
# * int key on in int label dimension is ambiguous:
46-
# - treat them like indices
47-
# OR
48-
# - treat them like values to lookup (len(key) has not relation with len(dim)
49-
# BUT if key is a tuple (nd-key), we have len(dim0) == dim(dimX)
50-
# * bool key on a non-bool dimension is non-ambiguous:
51-
# - treat them as a filter (len(key) must be == len(dim))
52-
# * bool key on a bool dimension is ambiguous:
53-
# - treat them as a filter (len(key) must be == len(dim) == 2)
54-
# eg [False, True], [True, False], [True, True], [False, False]
55-
# >>> I think this usage is unlikely to be used by users directly but might
56-
# - treat them like a subset of values to include in the cartesian product
57-
# eg, supposing we have a array of shape (bool[2], int[110], boo[2])
58-
# the key ([False], [1, 5, 9], [False, True]) would return an array
59-
# of shape [1, 3, 2]
60-
# OR
61-
# - treat them like values to lookup (len(key) has not relation with len(dim)
62-
# BUT if key is a tuple (nd-key), we have len(dim0) == dim(dimX)
63-
64-
65-
#TODO:
66-
# * unit TESTS !!!!
9+
# - a.sum(age=group_to_family(age[:]) + [':'])
10+
11+
# - this should work (I think)
12+
# - a.sum(age=[(l,) for l in age[:]] + slice(None))
13+
14+
# - a.with_total(age=np.sum)
15+
# - a.with_total(age=np.sum,np.avg) # potentially several totals
16+
# - a.append(age=a.sum(age))
17+
# - a.append(age='sum')
18+
# - a.append(age=sum)
19+
# - a.append(age='total=sum') # total = the name of the new label
20+
21+
# - the following should work already (modulo the axis name -> axis num)
22+
# - all_ages = a.sum(age=(':',))
23+
# - np.concatenate((a, all_ages), axis=age)
24+
25+
# - np.append(a, a.sum(age), axis=age)
26+
# - a.append(a.sum(age), axis=age)
27+
28+
# * reorder the labels of an axis
29+
# * modify read_csv format (last_column / time)
30+
# * test to_csv: does it consume too much mem?
31+
# ---> test pandas (one dimension horizontally)
32+
# * add labels in ValueGroups.__str__
33+
# * xlsx export workbook without overwriting some sheets (charts)
34+
# * implement x = bel.filter(age='0:10')
35+
# * implement y = bel.sum(sex='H,F')
36+
37+
# ? allow naming "one-shot" groups? e.g:
38+
# regsum = bel.sum(lipro='P01,P02 = P01P02; : = all')
39+
40+
# * review __getitem__ vs labels
41+
# o integer key on a non-integer label dimension is non-ambiguous:
42+
# => treat them like indices
43+
# o int key on an int label dimension is ambiguous:
44+
# => treat them like indices
45+
# OR
46+
# => treat them like values to look up (len(key) has no relation with
47+
# len(dim) BUT if key is a tuple (nd-key), we have
48+
# len(dim0) == dim(dimX)
49+
# o bool key on a non-bool dimension is non-ambiguous:
50+
# - treat them as a filter (len(key) must be == len(dim))
51+
# o bool key on a bool dimension is ambiguous:
52+
# - treat them as a filter (len(key) must be == len(dim) == 2)
53+
# eg [False, True], [True, False], [True, True], [False, False]
54+
# >>> I think this usage is unlikely to be used by users directly but might
55+
# - treat them like a subset of values to include in the cartesian product
56+
# eg, supposing we have an array of shape (bool[2], int[110], bool[2])
57+
# the key ([False], [1, 5, 9], [False, True]) would return an array
58+
# of shape [1, 3, 2]
59+
# OR
60+
# - treat them like values to look up (len(key) has no relation with len(dim)
61+
# BUT if key is a tuple (nd-key), we have len(dim0) == dim(dimX)
6762
# * evaluate the impact of label-only __getitem__: numpy/matplotlib/...
6863
# functions probably rely on __getitem__ with indices
64+
6965
# * docstring for all methods
70-
# * choose between subset and group. Having both is just confusing.
66+
# * choose between subset and group. Having both is just confusing, I think.
7167
# * check whether we could use np.array_repr/array_str (and
7268
# np.set_printoptions) instead of our own as_table/table2str
7369
# * IO functions: csv/hdf/excel?/...?
@@ -98,7 +94,7 @@
9894
# * test structured arrays
9995
# * review all method & argument names
10096
# * implement ValueGroup.__getitem__
101-
# ? allow __getitem__ with ValueGroups at any position since we usually know
97+
# ? allow __getitem__ with ValueGroups at any position since we know
10298
# which axis the ValueGroup corresponds to. ie: allow bel[vla] even though
10399
# geo is not the first dimension of bel.
104100
# ? move "utils" to its own project (so that it is not duplicated between
@@ -115,8 +111,7 @@
115111
# la.axes['sex'].labels
116112
#
117113

118-
import csv
119-
from itertools import izip, product, chain
114+
from itertools import product, chain
120115
import string
121116
import sys
122117

@@ -125,7 +120,6 @@
125120

126121
import tables
127122

128-
from orderedset import OrderedSet
129123
from utils import prod, table2str, table2csv, table2iode, timed, unique
130124

131125

@@ -888,8 +882,6 @@ def to_excel(self, filename, sep=None):
888882
for row, data in enumerate(np.asarray(self)):
889883
worksheet.write_row(1+row, 1, data)
890884

891-
892-
893885
def transpose(self, *args):
894886
axes_names = set(axis.name for axis in args)
895887
missing_axes = [axis for axis in self.axes
@@ -921,7 +913,7 @@ def ToAv(self, filename):
921913

922914
def parse(s):
923915
#parameters can be strings or numbers
924-
if(isinstance(s, str)):
916+
if isinstance(s, str):
925917
s = s.lower()
926918
if s in ('0', '1', 'false', 'true'):
927919
return s in ('1', 'true')
@@ -935,6 +927,7 @@ def parse(s):
935927
else:
936928
return s
937929

930+
938931
def df_aslarray(df, na=np.nan):
939932
axes_labels = [list(unique(level[labels]))
940933
for level, labels in zip(df.index.levels, df.index.labels)]
@@ -969,35 +962,28 @@ def read_csv(filepath, index_col, sep=',', na=np.nan):
969962
# axes_names)
970963
df = pd.read_csv(filepath, index_col=index_col, sep=sep)
971964
return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1), na)
972-
965+
966+
973967
def save_csv(l_array, filepath, sep=',', na=np.nan):
974968
df = l_array.as_dataframe()
975969
df.to_csv(filepath, sep=sep)
976970

977-
# HDF5 functions
971+
972+
# HDF5 functions
978973
def save_h5(l_array, name, filepath):
979974
df = l_array.as_dataframe()
980975
store = pd.HDFStore(filepath)
981976
store.put(name, df)
982977
store.close()
983978

979+
984980
def read_h5(name, filepath):
985981
store = pd.HDFStore(filepath)
986982
df = store.get(name)
987983
store.close()
988-
return df_aslarray(df)
984+
return df_aslarray(df)
985+
989986

990-
#EXCEL functions
991-
def save_excel(l_array, name, filepath):
992-
df = l_array.as_dataframe()
993-
writer = pd.ExcelWriter(filepath)
994-
df.to_excel(writer, name)
995-
writer.save()
996-
997-
def read_excel(name, filepath, index_col):
998-
df=pd.read_excel(filepath, name, index_col=index_col)
999-
return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1))
1000-
1001987
def SaveMatrices(h5_filename):
1002988
try:
1003989
h5file = tables.openFile(h5_filename, mode="w", title="IodeMatrix")
@@ -1053,6 +1039,20 @@ def LoadMatrix(h5_filename, matname):
10531039
finally:
10541040
h5file.close()
10551041

1042+
1043+
# EXCEL functions
1044+
def save_excel(l_array, name, filepath):
1045+
df = l_array.as_dataframe()
1046+
writer = pd.ExcelWriter(filepath)
1047+
df.to_excel(writer, name)
1048+
writer.save()
1049+
1050+
1051+
def read_excel(name, filepath, index_col):
1052+
df=pd.read_excel(filepath, name, index_col=index_col)
1053+
return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1))
1054+
1055+
10561056
if __name__ == '__main__':
10571057
#reg.Collapse('c:/tmp/reg.csv')
10581058
#reg.ToAv('reg.av')

test_la.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44

55
import numpy as np
66

7-
from larray import LArray, Axis, union, OrderedSet, to_labels, ValueGroup, \
8-
srange, to_key
7+
from larray import LArray, Axis, ValueGroup, union, to_labels, to_key, srange
98

109
#XXX: maybe we should force value groups to use tuples and families (groups of
1110
# groups) to use lists, or vice versa, so that we know which is which

0 commit comments

Comments
 (0)