# ? improve Labeler I don't know how though :)
# ? implement dict-like behavior for LArray.axes (to be able to do stuff like
# la.axes['sex'].labels
+#

import csv
from itertools import izip, product, chain
import string
import sys

import numpy as np
+import pandas as pd
+
import tables

from orderedset import OrderedSet
@@ -518,6 +521,17 @@ def __new__(cls, data, axes=None):
        axes = list(axes)
        obj.axes = axes
        return obj
+
+    def as_dataframe(self):
+        axes_labels = [a.labels.tolist() for a in self.axes[:-1]]
+        axes_names = [a.name for a in self.axes[:-1]]
+        axes_names[-1] = axes_names[-1] + '\\' + self.axes[-1].name
+        columns = self.axes[-1].labels.tolist()
+        full_index = [i for i in product(*axes_labels)]
+        index = pd.MultiIndex.from_tuples(full_index, names=axes_names)
+        df = pd.DataFrame(self.reshape(len(full_index), len(columns)), index, columns)
+        return df
+
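A rough usage sketch of as_dataframe (editorial illustration, not part of the commit; the 'age', 'sex' and 'time' axes and the data are invented, and it assumes Axis stores its labels as a numpy array so that .tolist() works): every axis except the last becomes a level of the row MultiIndex, the last level name keeps the column axis name after a backslash, and the last axis supplies the columns.

    arr = LArray(np.arange(12).reshape(2, 2, 3),
                 [Axis('age', [0, 1]),
                  Axis('sex', ['F', 'M']),
                  Axis('time', [2010, 2011, 2012])])
    df = arr.as_dataframe()
    # df.index.names -> ['age', 'sex\time']
    # df.columns     -> [2010, 2011, 2012]
    # df.shape       -> (4, 3)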

    #noinspection PyAttributeOutsideInit
    def __array_finalize__(self, obj):
@@ -644,39 +658,27 @@ def __str__(self):
    def as_table(self):
        if not self.ndim:
            return []
+
+        #ert    | unit | geo\time | 2012   | 2011   | 2010
+        #NEER27 | I05  | AT       | 101.41 | 101.63 | 101.63
+        #NEER27 | I05  | AU       | 134.86 | 125.29 | 117.08

-        # gender |      |
-        #  False | True | total
-        #     20 |   16 |    35
-
-        #  dead | gender |      |
-        #       |  False | True | total
-        # False |     20 |   15 |    35
-        #  True |      0 |    1 |     1
-        # total |     20 |   16 |    36
-
-        # agegroup | gender | dead  |      |
-        #          |        | False | True | total
-        #        5 |  False |    20 |   15 |    xx
-        #        5 |   True |     0 |    1 |    xx
-        #       10 |  False |    25 |   10 |    xx
-        #       10 |   True |     1 |    1 |    xx
-        #          |  total |    xx |   xx |    xx
        width = self.shape[-1]
        height = prod(self.shape[:-1])
        if self.axes is not None:
+            #axes_names = [axis.name for axis in self.axes]
            axes_names = [axis.name for axis in self.axes]
+            if len(axes_names) > 1:
+                axes_names[-2] = '\\'.join(axes_names[-2:])
+                axes_names.pop()
+
            axes_labels = [axis.labels for axis in self.axes]
        else:
            axes_names = None
            axes_labels = None

        if axes_names is not None:
-            result = [axes_names +
-                      [''] * (width - 1),
-                      # 2nd line
-                      [''] * (self.ndim - 1) +
-                      list(axes_labels[-1])]
+            result = [axes_names + list(axes_labels[-1])]
            #if self.row_totals is not None:
            #    result[0].append('')
            #    result[1].append('total')
@@ -854,58 +856,39 @@ def sum(self, *args, **kwargs):

    #XXX: sep argument does not seem very useful
    def to_excel(self, filename, sep=None):
-        import ExcelCom as ec
+        import xlsxwriter as xl

        if sep is None:
            sep = '_'
            #sep = self.sep
-        xl = ec.comExcel()
-        xl.load(filename)
+        workbook = xl.Workbook(filename)
        if self.ndim > 2:
            for key in product(*[axis.labels for axis in self.axes[:-2]]):
                sheetname = sep.join(str(k) for k in key)
-
                # sheet names must not:
                # * contain any of the following characters: : \ / ? * [ ]
                #XXX: this will NOT work for unicode strings !
                sheetname = sheetname.translate(string.maketrans('[:]', '(-)'),
                                                r'\/?*')  # chars to delete
                # * exceed 31 characters
-                sheetname = sheetname[:31]
+                # sheetname = sheetname[:31]
                # * be blank
                assert sheetname, "sheet name cannot be blank"
-
-                sheetdata = np.asarray(self[key])
-
-                xl.addworksheets(sheetname)
-
-                #TODO: reuse as_table, possibly adding an argument to
-                # as_table to determine how many dimensions should be "folded"
-
-                # last axis (time) as columns headers (ie the first row)
-                xl.setRange(sheetname, 2, 1,
-                            (tuple(str(l) for l in self.axes[-1].labels),))
-
-                # next to last axis as rows headers (ie the first column)
-                xl.setRange(sheetname, 1, 2,
-                            tuple((x,) for x in self.axes[-2].labels))
-                xl.setRange(sheetname, 2, 2, sheetdata)
+                worksheet = workbook.add_worksheet(sheetname)
+                worksheet.write_row(0, 1, self.axes[-1].labels)
+                worksheet.write_column(1, 0, self.axes[-2].labels)
+                for row, data in enumerate(np.asarray(self[key])):
+                    worksheet.write_row(1 + row, 1, data)
+

        else:
-            xl.addworksheets('Sheet1')
-
-            # last axis (time) as columns headers (ie the first row)
-            xl.setRange('Sheet1', 2, 1,
-                        (tuple(str(l) for l in self.axes[-1].labels),))
+            worksheet = workbook.add_worksheet('Sheet1')
+            worksheet.write_row(0, 1, self.axes[-1].labels)
            if self.ndim == 2:
-                # next to last axis as rows headers (ie the first column)
-                xl.setRange('Sheet1', 1, 2,
-                            tuple((str(x),) for x in self.axes[-2].labels))
-            xl.setRange('Sheet1', 2, 2, np.asarray(self))
+                worksheet.write_column(1, 0, self.axes[-2].labels)
+            for row, data in enumerate(np.asarray(self)):
+                worksheet.write_row(1 + row, 1, data)
+
+        # an xlsxwriter workbook must be closed explicitly, otherwise nothing
+        # is written to disk
+        workbook.close()

-        xl.save(filename)
-        xl.close()
-        xl.end
-        del xl

    def transpose(self, *args):
        axes_names = set(axis.name for axis in args)
@@ -937,44 +920,84 @@ def ToAv(self, filename):


def parse(s):
-    s = s.lower()
-    if s in ('0', '1', 'false', 'true'):
-        return s in ('1', 'true')
-    elif s.isdigit():
-        return int(s)
+    #parameters can be strings or numbers
+    if isinstance(s, str):
+        s = s.lower()
+        if s in ('0', '1', 'false', 'true'):
+            return s in ('1', 'true')
+        elif s.isdigit():
+            return int(s)
+        else:
+            try:
+                return float(s)
+            except ValueError:
+                return s
    else:
-        try:
-            return float(s)
-        except ValueError:
-            return s
-
-
-def read_csv(filepath):
-    import pandas as pd
+        return s
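A few illustrative calls to parse (editorial sketch, not from the commit). Note that the fallback branch returns the lower-cased string, since s is lower-cased before the checks:

    parse('2012')    # -> 2012 (int)
    parse('101.41')  # -> 101.41 (float)
    parse('true')    # -> True
    parse('NEER27')  # -> 'neer27' (string fallback, lower-cased)
    parse(5)         # -> 5 (non-strings pass through unchanged)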

-    # read the first line to determine how many axes (time excluded) we have
-    with open(filepath, 'rb') as f:
-        reader = csv.reader(f)
-        header = [parse(cell) for cell in reader.next()]
-    axes_names = [cell for cell in header if isinstance(cell, basestring)]
-    df = pd.read_csv(filepath, index_col=range(len(axes_names)))
-    assert df.index.names == axes_names, "%s != %s" % (df.index.names,
-                                                       axes_names)
-
-    # labels in index.levels are sorted, but the data is not, so we need to
-    # compute the "unsorted" labels !
-    # alternatives are to either use "df = df.sort_index()", or
-    # "df.index.get_level_values(level)" but they are both slower.
+def df_aslarray(df, na=np.nan):
    axes_labels = [list(unique(level[labels]))
                   for level, labels in zip(df.index.levels, df.index.labels)]
+    axes_names = df.index.names
+    laxis = axes_names[-1].split('\\')
+    if len(laxis) > 1:
+        axes_names[-1] = laxis[0]
    axes = [Axis(name, labels) for name, labels in zip(axes_names, axes_labels)]
    # pandas treats the "time" labels as column names (strings) so we need to
    # convert them to values
-    axes.append(Axis('time', [parse(cell) for cell in df.columns.values]))
-    data = df.values.reshape([len(axis.labels) for axis in axes])
-    return LArray(data, axes)
+    if len(laxis) > 1:
+        axes.append(Axis(laxis[1], [parse(cell) for cell in df.columns.values]))
+    else:
+        axes.append(Axis('time', [parse(cell) for cell in df.columns.values]))
+    sdf = df.reindex([i for i in product(*axes_labels)], df.columns.values)
+    if na is not np.nan:
+        sdf.fillna(na, inplace=True)
+    data = sdf.values.reshape([len(axis.labels) for axis in axes])
+    return LArray(data, axes)


+# CSV functions
+def read_csv(filepath, index_col, sep=',', na=np.nan):
+    # read the first line to determine how many axes (time excluded) we have
+    # with open(filepath, 'rb') as f:
+    #     reader = csv.reader(f, delimiter=sep)
+    #     header = [parse(cell) for cell in reader.next()]
+    # axes_names = [cell for cell in header if isinstance(cell, basestring)]
+    # df = pd.read_csv(filepath, index_col=range(len(axes_names)), sep=sep)
+    # assert df.index.names == axes_names, "%s != %s" % (df.index.names,
+    #                                                    axes_names)
+    df = pd.read_csv(filepath, index_col=index_col, sep=sep)
+    return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1), na)
+
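A rough sketch of the header convention read_csv and df_aslarray rely on (editorial illustration; the file name and values are invented): the last index column may carry both the row axis name and the column axis name separated by a backslash, as in the Eurostat-style layout shown in as_table above; when there is no backslash, the column axis name simply defaults to 'time'.

    # contents of a hypothetical 'neer.csv':
    #   ert,unit,geo\time,2010,2011,2012
    #   NEER27,I05,AT,101.63,101.63,101.41
    #   NEER27,I05,AU,117.08,125.29,134.86
    arr = read_csv('neer.csv', index_col=[0, 1, 2])
    # axes: 'ert', 'unit' and 'geo' from the index columns, plus 'time' from
    # the remaining headers (converted to ints by parse)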
+def save_csv(l_array, filepath, sep=',', na=np.nan):
+    df = l_array.as_dataframe()
+    df.to_csv(filepath, sep=sep)
+
+# HDF5 functions
+def save_h5(l_array, name, filepath):
+    df = l_array.as_dataframe()
+    store = pd.HDFStore(filepath)
+    store.put(name, df)
+    store.close()
+
+def read_h5(name, filepath):
+    store = pd.HDFStore(filepath)
+    df = store.get(name)
+    store.close()
+    return df_aslarray(df)
+
+#EXCEL functions
+def save_excel(l_array, name, filepath):
+    df = l_array.as_dataframe()
+    writer = pd.ExcelWriter(filepath)
+    df.to_excel(writer, name)
+    writer.save()
+
+def read_excel(name, filepath, index_col):
+    df = pd.read_excel(filepath, name, index_col=index_col)
+    return df_aslarray(df.reindex_axis(sorted(df.columns), axis=1))
+
def SaveMatrices(h5_filename):
    try:
        h5file = tables.openFile(h5_filename, mode="w", title="IodeMatrix")
@@ -1033,6 +1056,13 @@ def LoadMatrix(h5_filename, matname):
if __name__ == '__main__':
    #reg.Collapse('c:/tmp/reg.csv')
    #reg.ToAv('reg.av')
-    pass
-
-
+    bel = read_csv('bel.csv', index_col=[0, 1, 2, 3])
+    test = read_csv('ert_eff_ic_a.tsv', index_col=[0, 1, 2], sep='\t', na=0)
+    test.ToCsv('brol.csv')
+    save_csv(test, 'brolpd.csv')
+    test_csv = read_csv('brolpd.csv', index_col=[0, 1, 2])
+    save_excel(test, "TEST", "test.xls")
+    test_xls = read_excel("TEST", "test.xls", index_col=[0, 1, 2])
+    save_h5(test, 'test', 'store.h5')
+    test_h5 = read_h5('test', 'store.h5')
+    save_h5(bel, 'bel', 'store.h5')