75
75
import sys
76
76
77
77
import numpy as np
78
+ import pandas as pd
79
+
78
80
import tables
79
81
80
82
from utils import prod , table2str , table2csv , table2iode , timed , unique
@@ -292,6 +294,17 @@ def __new__(cls, data, axes=None):
292
294
axes = list (axes )
293
295
obj .axes = axes
294
296
return obj
297
+
298
+ def as_dataframe (self ):
299
+ axes_labels = [a .labels .tolist () for a in self .axes [:- 1 ]]
300
+ axes_names = [a .name for a in self .axes [:- 1 ]]
301
+ axes_names [- 1 ] = axes_names [- 1 ] + '\\ ' + self .axes [- 1 ].name
302
+ columns = self .axes [- 1 ].labels .tolist ()
303
+ full_index = [i for i in product (* axes_labels )]
304
+ index = pd .MultiIndex .from_tuples (full_index , names = axes_names )
305
+ df = pd .DataFrame (self .reshape (len (full_index ), len (columns )), index , columns )
306
+ return df
307
+
295
308
296
309
#noinspection PyAttributeOutsideInit
297
310
def __array_finalize__ (self , obj ):
@@ -401,39 +414,27 @@ def __str__(self):
401
414
def as_table (self ):
402
415
if not self .ndim :
403
416
return []
417
+
418
+ #ert | unit | geo\time | 2012 | 2011 | 2010
419
+ #NEER27 | I05 | AT | 101.41 | 101.63 | 101.63
420
+ #NEER27 | I05 | AU | 134.86 | 125.29 | 117.08
404
421
405
- # gender | |
406
- # False | True | total
407
- # 20 | 16 | 35
408
-
409
- # dead | gender | |
410
- # | False | True | total
411
- # False | 20 | 15 | 35
412
- # True | 0 | 1 | 1
413
- # total | 20 | 16 | 36
414
-
415
- # agegroup | gender | dead | |
416
- # | | False | True | total
417
- # 5 | False | 20 | 15 | xx
418
- # 5 | True | 0 | 1 | xx
419
- # 10 | False | 25 | 10 | xx
420
- # 10 | True | 1 | 1 | xx
421
- # | total | xx | xx | xx
422
422
width = self .shape [- 1 ]
423
423
height = prod (self .shape [:- 1 ])
424
424
if self .axes is not None :
425
+ #axes_names = [axis.name for axis in self.axes]
425
426
axes_names = [axis .name for axis in self .axes ]
427
+ if len (axes_names ) > 1 :
428
+ axes_names [- 2 ] = '\\ ' .join (axes_names [- 2 :])
429
+ axes_names .pop ()
430
+
426
431
axes_labels = [axis .labels for axis in self .axes ]
427
432
else :
428
433
axes_names = None
429
434
axes_labels = None
430
435
431
436
if axes_names is not None :
432
- result = [axes_names +
433
- ['' ] * (width - 1 ),
434
- # 2nd line
435
- ['' ] * (self .ndim - 1 ) +
436
- list (axes_labels [- 1 ])]
437
+ result = [axes_names + list (axes_labels [- 1 ])]
437
438
#if self.row_totals is not None:
438
439
# result[0].append('')
439
440
# result[1].append('total')
@@ -659,44 +660,82 @@ def ToAv(self, filename):
659
660
660
661
661
662
def parse(s):
    """Convert a CSV cell to a bool, int or float when possible.

    Non-string values pass through unchanged; strings that cannot be
    interpreted as a number or boolean are returned lower-cased.
    """
    if not isinstance(s, str):
        # already-typed cell (e.g. a number from pandas): nothing to do
        return s
    lowered = s.lower()
    if lowered in ('0', '1', 'false', 'true'):
        # boolean flags may be spelled as 0/1 or false/true
        return lowered in ('1', 'true')
    if lowered.isdigit():
        return int(lowered)
    try:
        return float(lowered)
    except ValueError:
        return lowered
672
676
677
def df_aslarray(df, na=np.nan):
    """Convert a pandas DataFrame with a row MultiIndex into an LArray.

    The last level of the row index may carry a combined name of the
    form "rowaxis\\colaxis"; in that case the part after the backslash
    names the columns axis, otherwise the columns axis is named 'time'.

    na: replacement value for missing data.  Defaults to NaN, meaning
    missing values are left as-is.
    """
    # labels in index.levels are sorted, but the data is not, so we need
    # to recompute the labels in their original (file) order
    axes_labels = [list(unique(level[labels]))
                   for level, labels in zip(df.index.levels,
                                            df.index.labels)]
    # copy so we do not mutate the caller's df.index.names in-place
    axes_names = list(df.index.names)
    laxis = axes_names[-1].split('\\')
    # bug fix: split() always returns at least one element, so the old
    # "len(laxis) > 0" test was always true (making the 'time' fallback
    # unreachable and laxis[1] an IndexError when there is no backslash)
    if len(laxis) > 1:
        axes_names[-1] = laxis[0]
    axes = [Axis(name, labels)
            for name, labels in zip(axes_names, axes_labels)]
    # pandas treats the "time" labels as column names (strings) so we
    # need to convert them to values
    colaxis_name = laxis[1] if len(laxis) > 1 else 'time'
    axes.append(Axis(colaxis_name,
                     [parse(cell) for cell in df.columns.values]))
    # reindex on the full cartesian product so the data can be reshaped
    # into a dense array
    sdf = df.reindex(list(product(*axes_labels)), df.columns.values)
    # bug fix: "na != np.nan" was always True (NaN compares unequal to
    # itself); only fill when a real replacement value was requested
    if not (isinstance(na, float) and np.isnan(na)):
        sdf.fillna(na, inplace=True)
    data = sdf.values.reshape([len(axis.labels) for axis in axes])
    return LArray(data, axes)
673
697
674
# CSV functions
def read_csv(filepath, sep=',', na=np.nan):
    """Read a CSV file into an LArray.

    The header must list the axes names first (string cells), followed
    by the labels of the last (columns) axis.  `na` is forwarded to
    df_aslarray as the missing-value replacement.
    """
    # peek at the header: the non-numeric cells are the axes names, the
    # remaining header cells are the column labels
    with open(filepath, 'rb') as f:
        header = [parse(cell)
                  for cell in next(csv.reader(f, delimiter=sep))]
    axes_names = [cell for cell in header if isinstance(cell, basestring)]
    df = pd.read_csv(filepath, index_col=range(len(axes_names)), sep=sep)
    assert df.index.names == axes_names, \
        "%s != %s" % (df.index.names, axes_names)
    return df_aslarray(df, na)
710
+
711
def save_csv(l_array, filepath, sep=',', na=np.nan):
    """Write an LArray to a CSV file via its DataFrame representation.

    na: representation used for missing values in the output file.
    Bug fix: the original implementation accepted `na` but silently
    ignored it; it is now forwarded to pandas as na_rep.  The default
    (NaN) keeps pandas' behavior of writing empty cells.
    """
    df = l_array.as_dataframe()
    if isinstance(na, float) and np.isnan(na):
        # default: let pandas write missing values as empty cells
        df.to_csv(filepath, sep=sep)
    else:
        df.to_csv(filepath, sep=sep, na_rep=str(na))
714
+
715
# HDF5 functions
def save_h5(l_array, name, filepath):
    """Store an LArray under key `name` in the HDF5 file at `filepath`.

    Bug fix: the store is now closed in a finally block so the file
    handle is released even when put() raises.
    """
    df = l_array.as_dataframe()
    store = pd.HDFStore(filepath)
    try:
        store.put(name, df)
    finally:
        store.close()
721
+
722
def read_h5(name, filepath):
    """Read the DataFrame stored under key `name` in an HDF5 file and
    convert it to an LArray.

    Bug fix: the store is now closed in a finally block so the file
    handle is released even when get() raises.
    """
    store = pd.HDFStore(filepath)
    try:
        df = store.get(name)
    finally:
        store.close()
    return df_aslarray(df)
727
+
728
#EXCEL functions
def save_excel(l_array, name, filepath):
    """Write an LArray as sheet `name` of the Excel file at `filepath`."""
    writer = pd.ExcelWriter(filepath)
    l_array.as_dataframe().to_excel(writer, name)
    writer.save()
734
+
735
def read_excel(name, filepath, index_col):
    """Read sheet `name` of an Excel file into an LArray.

    index_col: column number(s) forming the row (Multi)Index, forwarded
    to pandas.read_excel.
    """
    frame = pd.read_excel(filepath, name, index_col=index_col)
    return df_aslarray(frame)
738
+
700
739
def SaveMatrices (h5_filename ):
701
740
try :
702
741
h5file = tables .openFile (h5_filename , mode = "w" , title = "IodeMatrix" )
@@ -755,4 +794,10 @@ def LoadMatrix(h5_filename, matname):
755
794
if __name__ == '__main__':
    #reg.Collapse('c:/tmp/reg.csv')
    #reg.ToAv('reg.av')
    # ad-hoc round-trip smoke test: read a tab-separated file (missing
    # values replaced by 0), then write it back out and re-read it
    # through the CSV, Excel and HDF5 back-ends
    test = read_csv('ert_eff_ic_a.tsv', '\t', 0)
    test.ToCsv('brol.csv')
    save_csv(test, 'brolpd.csv')
    save_excel(test, "TEST", "test.xls")
    # NOTE(review): results of the re-reads are bound but never compared
    # against `test` — presumably inspected manually; confirm
    test_xls = read_excel("TEST", "test.xls", index_col=[0, 1, 2])
    save_h5(test, 'test', 'store.h5')
    test_h5 = read_h5('test', 'store.h5')
0 commit comments