@@ -4,70 +4,66 @@
 Matrix class
 """
 #TODO
-# * implement new syntax
-
+# * cleanup unit tests
 # * easily add sum column for a dimension
-#   a.sum(age=group_to_family(age[:]) + [':'])
-
-#   this should work (I think)
-#   a.sum(age=[(l,) for l in age[:]] + slice(None))
-
-#   a.with_total(age=np.sum)
-#   a.with_total(age=np.sum, np.avg)  # potentially several totals
-#   a.append(age=a.sum(age))
-#   a.append(age='sum')
-#   a.append(age=sum)
-#   a.append(age='total=sum')  # total = the name of the new label
-
-#   the following should work already (modulo the axis name -> axis num)
-#   all_ages = a.sum(age=(':',))
-#   np.concatenate((a, all_ages), axis=age)
-
-#   np.append(a, a.sum(age), axis=age)
-#   a.append(a.sum(age), axis=age)
-
-
-# reorder an axis' labels
-# modify read_csv format (last_column / time)
-# test to_csv: does it consume too much mem?
-# ---> test pandas (one dimension horizontally)
-# add labels in ValueGroups.__str__
-# xlsx export workbook without overwriting some sheets (charts)
-# implement x = bel.filter(age='0:10')
-# implement y = bel.sum(sex='H,F')
-
-# XXX: allow naming "one-shot" groups? e.g:
-# regsum = bel.sum(lipro='P01,P02 = P01P02; : = all')
-
-# XXX: __getitem__
-# * integer key on a non-integer label dimension is non-ambiguous:
-#   - treat them like indices
-# * int key on an int label dimension is ambiguous:
-#   - treat them like indices
-#     OR
-#   - treat them like values to look up (len(key) has no relation to len(dim)
-#     BUT if key is a tuple (nd-key), we have len(dim0) == dim(dimX))
-# * bool key on a non-bool dimension is non-ambiguous:
-#   - treat them as a filter (len(key) must be == len(dim))
-# * bool key on a bool dimension is ambiguous:
-#   - treat them as a filter (len(key) must be == len(dim) == 2)
-#     eg [False, True], [True, False], [True, True], [False, False]
-#     >>> I think this usage is unlikely to be used by users directly but might
-#   - treat them like a subset of values to include in the cartesian product
-#     eg, supposing we have an array of shape (bool[2], int[110], bool[2]),
-#     the key ([False], [1, 5, 9], [False, True]) would return an array
-#     of shape [1, 3, 2]
-#     OR
-#   - treat them like values to look up (len(key) has no relation to len(dim)
-#     BUT if key is a tuple (nd-key), we have len(dim0) == dim(dimX))
-
-
-#TODO:
-# * unit TESTS !!!!
+#   - a.sum(age=group_to_family(age[:]) + [':'])
+
+#   - this should work (I think)
+#   - a.sum(age=[(l,) for l in age[:]] + slice(None))
+
+#   - a.with_total(age=np.sum)
+#   - a.with_total(age=np.sum, np.avg)  # potentially several totals
+#   - a.append(age=a.sum(age))
+#   - a.append(age='sum')
+#   - a.append(age=sum)
+#   - a.append(age='total=sum')  # total = the name of the new label
+
+#   - the following should work already (modulo the axis name -> axis num)
+#   - all_ages = a.sum(age=(':',))
+#   - np.concatenate((a, all_ages), axis=age)
+
+#   - np.append(a, a.sum(age), axis=age)
+#   - a.append(a.sum(age), axis=age)
+
+# * reorder an axis' labels
+# * modify read_csv format (last_column / time)
+# * test to_csv: does it consume too much mem?
+#   ---> test pandas (one dimension horizontally)
+# * add labels in ValueGroups.__str__
+# * xlsx export workbook without overwriting some sheets (charts)
+# * implement x = bel.filter(age='0:10')
+# * implement y = bel.sum(sex='H,F')
+
+# ? allow naming "one-shot" groups? e.g:
+#   regsum = bel.sum(lipro='P01,P02 = P01P02; : = all')
+
+# * review __getitem__ vs labels
+#   o integer key on a non-integer label dimension is non-ambiguous:
+#     => treat them like indices
+#   o int key on an int label dimension is ambiguous:
+#     => treat them like indices
+#        OR
+#     => treat them like values to look up (len(key) has no relation to
+#        len(dim) BUT if key is a tuple (nd-key), we have
+#        len(dim0) == dim(dimX))
+#   o bool key on a non-bool dimension is non-ambiguous:
+#     - treat them as a filter (len(key) must be == len(dim))
+#   o bool key on a bool dimension is ambiguous:
+#     - treat them as a filter (len(key) must be == len(dim) == 2)
+#       eg [False, True], [True, False], [True, True], [False, False]
+#       >>> I think this usage is unlikely to be used by users directly but might
+#     - treat them like a subset of values to include in the cartesian product
+#       eg, supposing we have an array of shape (bool[2], int[110], bool[2]),
+#       the key ([False], [1, 5, 9], [False, True]) would return an array
+#       of shape [1, 3, 2]
+#       OR
+#     - treat them like values to look up (len(key) has no relation to len(dim)
+#       BUT if key is a tuple (nd-key), we have len(dim0) == dim(dimX))
 # * evaluate the impact of label-only __getitem__: numpy/matplotlib/...
 #   functions probably rely on __getitem__ with indices
+
 # * docstring for all methods
-# * choose between subset and group. Having both is just confusing.
+# * choose between subset and group. Having both is just confusing, I think.
 # * check whether we could use np.array_repr/array_str (and
 #   np.set_printoptions) instead of our own as_table/table2str
 # * IO functions: csv/hdf/excel?/...?
@@ -98,7 +94,7 @@
 # * test structured arrays
 # * review all method & argument names
 # * implement ValueGroup.__getitem__
-# ? allow __getitem__ with ValueGroups at any position since we usually know
+# ? allow __getitem__ with ValueGroups at any position since we know
 #   which axis the ValueGroup corresponds to. i.e. allow bel[vla] even though
 #   geo is not the first dimension of bel.
 # ? move "utils" to its own project (so that it is not duplicated between
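
A rough sketch of the "ValueGroups at any position" idea: since a group's labels normally live on exactly one axis, the axis can be recovered by looking the labels up on each axis in turn. ValueGroup internals are assumed here; a real implementation could simply store the axis on the group itself:

    def guess_axis(axes_labels, group):
        # return the position of the first axis containing all of the
        # group's labels, e.g. ['vla'] resolves to the geo axis below
        for axis_num, labels in enumerate(axes_labels):
            if all(label in labels for label in group):
                return axis_num
        raise ValueError("no axis contains %r" % (group,))

    bel_axes = [list(range(110)), ['bru', 'vla', 'wal'], ['H', 'F']]
    print(guess_axis(bel_axes, ['vla']))  # -> 1 (the geo axis)
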
@@ -115,8 +111,7 @@
 # la.axes['sex'].labels
 #
 
-import csv
-from itertools import izip, product, chain
+from itertools import product, chain
 import string
 import sys
 
@@ -125,7 +120,6 @@
 
 import tables
 
-from orderedset import OrderedSet
 from utils import prod, table2str, table2csv, table2iode, timed, unique
 
 
@@ -888,8 +882,6 @@ def to_excel(self, filename, sep=None):
         for row, data in enumerate(np.asarray(self)):
             worksheet.write_row(1 + row, 1, data)
 
-
-
     def transpose(self, *args):
         axes_names = set(axis.name for axis in args)
         missing_axes = [axis for axis in self.axes
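
The transpose logic visible above puts the axes passed as arguments first. A numpy-level sketch of the rest of the method, which the diff cuts off (the helper name and the exact ordering of the leftover axes are assumptions):

    import numpy as np

    def transpose_sketch(data, axes_names, *names):
        # axes named explicitly come first; the others keep their order
        missing = [n for n in axes_names if n not in names]
        order = [axes_names.index(n) for n in list(names) + missing]
        return data.transpose(order)

    a = np.zeros((2, 3, 4))  # axes ('age', 'geo', 'sex')
    print(transpose_sketch(a, ['age', 'geo', 'sex'], 'sex').shape)  # (4, 2, 3)
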
@@ -921,7 +913,7 @@ def ToAv(self, filename):
 
 def parse(s):
     # parameters can be strings or numbers
-    if (isinstance(s, str)):
+    if isinstance(s, str):
        s = s.lower()
        if s in ('0', '1', 'false', 'true'):
            return s in ('1', 'true')
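
Expected behaviour of parse, per the branch shown above and the final `return s` visible in the next hunk (the numeric cases sit in the part of the function the diff elides, so those are inferred from the "strings or numbers" comment):

    parse('True')   # -> True ('true' is in the boolean set after .lower())
    parse('0')      # -> False
    parse('hello')  # -> 'hello' (falls through to the final return s)
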
@@ -935,6 +927,7 @@ def parse(s):
     else:
         return s
 
+
 def df_aslarray(df, na=np.nan):
     axes_labels = [list(unique(level[labels]))
                    for level, labels in zip(df.index.levels, df.index.labels)]
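
What the levels/labels pair holds, for readers less familiar with pandas MultiIndex internals (df.index.labels is the attribute name in the pandas of this era; later versions renamed it to df.index.codes, which the runnable sketch below uses):

    import pandas as pd

    idx = pd.MultiIndex.from_product([['H', 'F'], [2010, 2011]],
                                     names=['sex', 'time'])
    # each level holds the distinct values of one index dimension...
    print(list(idx.levels[0]))  # ['H', 'F']
    # ...and each labels/codes array holds positions into that level, one
    # per row, so unique(level[labels]) recovers labels in row order
    print(list(idx.codes[0]))   # [0, 0, 1, 1]
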
@@ -969,35 +962,28 @@ def read_csv(filepath, index_col, sep=',', na=np.nan):
     # axes_names)
     df = pd.read_csv(filepath, index_col=index_col, sep=sep)
     return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1), na)
-
+
+
 def save_csv(l_array, filepath, sep=',', na=np.nan):
     df = l_array.as_dataframe()
     df.to_csv(filepath, sep=sep)
 
-# HDF5 functions
+
+# HDF5 functions
 def save_h5(l_array, name, filepath):
     df = l_array.as_dataframe()
     store = pd.HDFStore(filepath)
     store.put(name, df)
     store.close()
 
+
 def read_h5(name, filepath):
     store = pd.HDFStore(filepath)
     df = store.get(name)
     store.close()
-    return df_aslarray(df)
+    return df_aslarray(df)
+
 
-#EXCEL functions
-def save_excel(l_array, name, filepath):
-    df = l_array.as_dataframe()
-    writer = pd.ExcelWriter(filepath)
-    df.to_excel(writer, name)
-    writer.save()
-
-def read_excel(name, filepath, index_col):
-    df = pd.read_excel(filepath, name, index_col=index_col)
-    return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1))
-
 def SaveMatrices(h5_filename):
     try:
         h5file = tables.openFile(h5_filename, mode="w", title="IodeMatrix")
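
A round-trip sketch of what the HDF5 helpers above do at the pandas level (the file name and key are illustrative; requires PyTables, which the module already imports):

    import pandas as pd

    df = pd.DataFrame({'H': [1, 2], 'F': [3, 4]})

    store = pd.HDFStore('test.h5')  # save_h5: one DataFrame per named key
    store.put('pop', df)
    store.close()

    store = pd.HDFStore('test.h5')  # read_h5: fetch by name (df_aslarray
    df2 = store.get('pop')          # then turns it back into an LArray in
    store.close()                   # the helpers above)

    assert df.equals(df2)
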
@@ -1053,6 +1039,20 @@ def LoadMatrix(h5_filename, matname):
     finally:
         h5file.close()
 
+
+# EXCEL functions
+def save_excel(l_array, name, filepath):
+    df = l_array.as_dataframe()
+    writer = pd.ExcelWriter(filepath)
+    df.to_excel(writer, name)
+    writer.save()
+
+
+def read_excel(name, filepath, index_col):
+    df = pd.read_excel(filepath, name, index_col=index_col)
+    return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1))
+
+
 if __name__ == '__main__':
     #reg.Collapse('c:/tmp/reg.csv')
     #reg.ToAv('reg.av')
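
And the matching pandas-level round trip for the relocated Excel helpers (sheet name and path are illustrative; writer.save() mirrors the pandas API used in this file, while later pandas versions use writer.close() instead):

    import pandas as pd

    df = pd.DataFrame({'H': [1, 2], 'F': [3, 4]})

    writer = pd.ExcelWriter('test.xlsx')  # save_excel
    df.to_excel(writer, 'pop')
    writer.save()

    df2 = pd.read_excel('test.xlsx', 'pop', index_col=0)  # read_excel
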