# This is a workaround snippet for reading a very large CSV file that exceeds
# the machine's memory and dumping it into an SQLite database in chunks,
# using pandas.
#
# Sebastian Raschka, 2015
#
# Tested in Python 3.4.2 and pandas 0.15.2

import pandas as pd
import sqlite3
import subprocess

# Input and output file paths
in_csv = '../data/my_large.csv'
out_sqlite = '../data/my.sqlite'

table_name = 'my_table'  # name of the SQLite database table
chunksize = 100000       # number of rows to process per iteration

# Get the number of lines in the CSV file (uses the Unix `wc` utility)
nlines = subprocess.check_output('wc -l %s' % in_csv, shell=True)
nlines = int(nlines.split()[0])

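# A portable alternative sketch (assumption: an extra pass over the file is
# acceptable here); it avoids the dependency on the Unix `wc` utility:
#
# with open(in_csv) as f:
#     nlines = sum(1 for _ in f)
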
# Connect to the database
cnx = sqlite3.connect(out_sqlite)

# Iteratively read the CSV file and append the rows to the SQLite table
for i in range(0, nlines, chunksize):

    df = pd.read_csv(in_csv,
                     header=None,      # no header row; column names are assigned below
                     nrows=chunksize,  # number of rows to read per iteration
                     skiprows=i)       # skip the rows that were already read

    # assign the column names
    df.columns = ['molecule_id', 'charge', 'db', 'drugsnow', 'hba',
                  'hbd', 'loc', 'nrb', 'smiles']

    df.to_sql(table_name,
              cnx,
              index=False,         # don't write the DataFrame index to the table
              if_exists='append')  # append each chunk to the same table

cnx.close()
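
# --- Alternative sketch (not part of the original snippet) ---
# pandas can do the chunking itself: read_csv(chunksize=...) returns an
# iterator of DataFrames, which avoids the `wc -l` call and the repeated
# re-scanning of already-processed rows. It is left commented out because
# running it after the loop above would append the same rows a second time;
# it would replace the line count and the loop, under the same assumptions
# about the file layout and column names:
#
# cnx = sqlite3.connect(out_sqlite)
# columns = ['molecule_id', 'charge', 'db', 'drugsnow', 'hba',
#            'hbd', 'loc', 'nrb', 'smiles']
# for chunk in pd.read_csv(in_csv, header=None, names=columns,
#                          chunksize=chunksize):
#     chunk.to_sql(table_name, cnx, index=False, if_exists='append')
# cnx.close()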
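
# --- Usage sketch (not part of the original snippet) ---
# The data can be read back from SQLite in memory-friendly pieces as well,
# since pandas' read_sql_query also accepts a chunksize. The print() call is
# only a placeholder for whatever per-chunk processing is actually needed.
cnx = sqlite3.connect(out_sqlite)
for chunk in pd.read_sql_query('SELECT * FROM %s' % table_name, cnx,
                               chunksize=chunksize):
    print(chunk.shape)  # placeholder: replace with the real per-chunk work
cnx.close()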