# This is a workaround snippet for reading a very large CSV file that exceeds
# the machine's memory and dumping it into an SQLite database in chunks,
# using pandas.
#
# Sebastian Raschka, 2015
#
# Tested in Python 3.4.2 and pandas 0.15.2

import pandas as pd
import sqlite3
import subprocess

# Input and output file paths
in_csv = '../data/my_large.csv'
out_sqlite = '../data/my.sqlite'

table_name = 'my_table'  # name of the SQLite database table
chunksize = 100000       # number of rows to process per iteration

# Get the number of lines in the CSV file (uses the Unix `wc` utility)
nlines = subprocess.check_output('wc -l %s' % in_csv, shell=True)
nlines = int(nlines.split()[0])

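# A portable alternative sketch (assumption: an extra pass over the file is
# acceptable here); it avoids the dependency on the Unix `wc` utility:
#
# with open(in_csv) as f:
#     nlines = sum(1 for _ in f)
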
# Connect to the database
cnx = sqlite3.connect(out_sqlite)

# Iteratively read the CSV file and append the rows to the SQLite table
for i in range(0, nlines, chunksize):

    df = pd.read_csv(in_csv,
                     header=None,      # no header row; column names are assigned below
                     nrows=chunksize,  # number of rows to read per iteration
                     skiprows=i)       # skip the rows that were already read

    # assign the column names
    df.columns = ['molecule_id', 'charge', 'db', 'drugsnow', 'hba',
                  'hbd', 'loc', 'nrb', 'smiles']

    df.to_sql(table_name,
              cnx,
              index=False,         # don't write the DataFrame index to the table
              if_exists='append')  # append each chunk to the same table

cnx.close()
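
# --- Alternative sketch (not part of the original snippet) ---
# pandas can do the chunking itself: read_csv(chunksize=...) returns an
# iterator of DataFrames, which avoids the `wc -l` call and the repeated
# re-scanning of already-processed rows. It is left commented out because
# running it after the loop above would append the same rows a second time;
# it would replace the line count and the loop, under the same assumptions
# about the file layout and column names:
#
# cnx = sqlite3.connect(out_sqlite)
# columns = ['molecule_id', 'charge', 'db', 'drugsnow', 'hba',
#            'hbd', 'loc', 'nrb', 'smiles']
# for chunk in pd.read_csv(in_csv, header=None, names=columns,
#                          chunksize=chunksize):
#     chunk.to_sql(table_name, cnx, index=False, if_exists='append')
# cnx.close()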
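
# --- Usage sketch (not part of the original snippet) ---
# The data can be read back from SQLite in memory-friendly pieces as well,
# since pandas' read_sql_query also accepts a chunksize. The print() call is
# only a placeholder for whatever per-chunk processing is actually needed.
cnx = sqlite3.connect(out_sqlite)
for chunk in pd.read_sql_query('SELECT * FROM %s' % table_name, cnx,
                               chunksize=chunksize):
    print(chunk.shape)  # placeholder: replace with the real per-chunk work
cnx.close()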