|
| 1 | +""" |
| 2 | +Import historical stock data from yahoo finance. |
| 3 | +""" |
| 4 | + |
| 5 | +import argparse |
| 6 | +from datetime import datetime |
| 7 | +import predictionio |
| 8 | +import pytz |
| 9 | +import time |
| 10 | +from pandas.io import data as pdata |
| 11 | +import numpy |
| 12 | + |
# Unix epoch as a timezone-aware UTC datetime; baseline for since_epoch().
EPOCH = datetime(1970, 1, 1, tzinfo=pytz.utc)

# S&P 500 constituent tickers (snapshot circa 2014). Symbols use the
# Yahoo Finance style, e.g. "BRK-B" with a dash instead of a dot.
SP500_LIST = [
    "A", "AA", "AAPL", "ABBV", "ABC", "ABT", "ACE", "ACN", "ACT", "ADBE", "ADI",
    "ADM", "ADP", "ADS", "ADSK", "ADT", "AEE", "AEP", "AES", "AET", "AFL",
    "AGN", "AIG", "AIV", "AIZ", "AKAM", "ALL", "ALLE", "ALTR", "ALXN", "AMAT",
    "AME", "AMGN", "AMP", "AMT", "AMZN", "AN", "AON", "APA", "APC", "APD",
    "APH", "ARG", "ATI", "AVB", "AVP", "AVY", "AXP", "AZO", "BA", "BAC", "BAX",
    "BBBY", "BBT", "BBY", "BCR", "BDX", "BEAM", "BEN", "BF-B", "BHI", "BIIB",
    "BK", "BLK", "BLL", "BMS", "BMY", "BRCM", "BRK-B", "BSX", "BTU", "BWA",
    "BXP", "C", "CA", "CAG", "CAH", "CAM", "CAT", "CB", "CBG", "CBS", "CCE",
    "CCI", "CCL", "CELG", "CERN", "CF", "CFN", "CHK", "CHRW", "CI", "CINF",
    "CL", "CLX", "CMA", "CMCSA", "CME", "CMG", "CMI", "CMS", "CNP", "CNX",
    "COF", "COG", "COH", "COL", "COP", "COST", "COV", "CPB", "CRM", "CSC",
    "CSCO", "CSX", "CTAS", "CTL", "CTSH", "CTXS", "CVC", "CVS", "CVX", "D",
    "DAL", "DD", "DE", "DFS", "DG", "DGX", "DHI", "DHR", "DIS", "DISCA", "DLPH",
    "DLTR", "DNB", "DNR", "DO", "DOV", "DOW", "DPS", "DRI", "DTE", "DTV", "DUK",
    "DVA", "DVN", "EA", "EBAY", "ECL", "ED", "EFX", "EIX", "EL", "EMC", "EMN",
    "EMR", "EOG", "EQR", "EQT", "ESRX", "ESS", "ESV", "ETFC", "ETN", "ETR",
    "EW", "EXC", "EXPD", "EXPE", "F", "FAST", "FB", "FCX", "FDO", "FDX", "FE",
    "FFIV", "FIS", "FISV", "FITB", "FLIR", "FLR", "FLS", "FMC", "FOSL", "FOXA",
    "FRX", "FSLR", "FTI", "FTR", "GAS", "GCI", "GD", "GE", "GGP", "GHC", "GILD",
    "GIS", "GLW", "GM", "GMCR", "GME", "GNW", "GOOG", "GOOGL", "GPC", "GPS",
    "GRMN", "GS", "GT", "GWW", "HAL", "HAR", "HAS", "HBAN", "HCBK", "HCN",
    "HCP", "HD", "HES", "HIG", "HOG", "HON", "HOT", "HP", "HPQ", "HRB", "HRL",
    "HRS", "HSP", "HST", "HSY", "HUM", "IBM", "ICE", "IFF", "IGT", "INTC",
    "INTU", "IP", "IPG", "IR", "IRM", "ISRG", "ITW", "IVZ", "JBL", "JCI", "JEC",
    "JNJ", "JNPR", "JOY", "JPM", "JWN", "K", "KEY", "KIM", "KLAC", "KMB", "KMI",
    "KMX", "KO", "KORS", "KR", "KRFT", "KSS", "KSU", "L", "LB", "LEG", "LEN",
    "LH", "LLL", "LLTC", "LLY", "LM", "LMT", "LNC", "LO", "LOW", "LRCX", "LSI",
    "LUK", "LUV", "LYB", "M", "MA", "MAC", "MAR", "MAS", "MAT", "MCD", "MCHP",
    "MCK", "MCO", "MDLZ", "MDT", "MET", "MHFI", "MHK", "MJN", "MKC", "MMC",
    "MMM", "MNST", "MO", "MON", "MOS", "MPC", "MRK", "MRO", "MS", "MSFT", "MSI",
    "MTB", "MU", "MUR", "MWV", "MYL", "NBL", "NBR", "NDAQ", "NE", "NEE", "NEM",
    "NFLX", "NFX", "NI", "NKE", "NLSN", "NOC", "NOV", "NRG", "NSC", "NTAP",
    "NTRS", "NU", "NUE", "NVDA", "NWL", "NWSA", "OI", "OKE", "OMC", "ORCL",
    "ORLY", "OXY", "PAYX", "PBCT", "PBI", "PCAR", "PCG", "PCL", "PCLN", "PCP",
    "PDCO", "PEG", "PEP", "PETM", "PFE", "PFG", "PG", "PGR", "PH", "PHM", "PKI",
    "PLD", "PLL", "PM", "PNC", "PNR", "PNW", "POM", "PPG", "PPL", "PRGO", "PRU",
    "PSA", "PSX", "PVH", "PWR", "PX", "PXD", "QCOM", "QEP", "R", "RAI", "RDC",
    "REGN", "RF", "RHI", "RHT", "RIG", "RL", "ROK", "ROP", "ROST", "RRC", "RSG",
    "RTN", "SBUX", "SCG", "SCHW", "SE", "SEE", "SHW", "SIAL", "SJM", "SLB",
    "SLM", "SNA", "SNDK", "SNI", "SO", "SPG", "SPLS", "SRCL", "SRE", "STI",
    "STJ", "STT", "STX", "STZ", "SWK", "SWN", "SWY", "SYK", "SYMC", "SYY", "T",
    "TAP", "TDC", "TE", "TEG", "TEL", "TGT", "THC", "TIF", "TJX", "TMK", "TMO",
    "TRIP", "TROW", "TRV", "TSCO", "TSN", "TSO", "TSS", "TWC", "TWX", "TXN",
    "TXT", "TYC", "UNH", "UNM", "UNP", "UPS", "URBN", "USB", "UTX", "V", "VAR",
    "VFC", "VIAB", "VLO", "VMC", "VNO", "VRSN", "VRTX", "VTR", "VZ", "WAG",
    "WAT", "WDC", "WEC", "WFC", "WFM", "WHR", "WIN", "WLP", "WM", "WMB", "WMT",
    "WU", "WY", "WYN", "WYNN", "X", "XEL", "XL", "XLNX", "XOM", "XRAY", "XRX",
    "XYL", "YHOO", "YUM", "ZION", "ZMH", "ZTS"]

# Broad-market and SPDR sector ETFs imported alongside the S&P 500 names.
ETF_LIST = ["QQQ", "SPY", "XLY", "XLP", "XLE", "XLF", "XLV",
            "XLI", "XLB", "XLK", "XLU"]
| 67 | + |
| 68 | + |
def since_epoch(dt):
    """Return the number of seconds between *dt* and the Unix epoch.

    *dt* must be timezone-aware (EPOCH is UTC-aware; subtracting a naive
    datetime from it would raise TypeError).
    """
    elapsed = dt - EPOCH
    return elapsed.total_seconds()
| 71 | + |
| 72 | + |
| 73 | +def import_data(client, app_id, ticker, start_time, end_time, event_time): |
| 74 | + print "Importing:", ticker, start_time, end_time |
| 75 | + |
| 76 | + try: |
| 77 | + df = pdata.DataReader(ticker, 'yahoo', start_time, end_time) |
| 78 | + print "Extracted:", df.index[0], df.index[-1] |
| 79 | + except IOError, ex: |
| 80 | + print ex |
| 81 | + print "Data not exist. Returning" |
| 82 | + return |
| 83 | + |
| 84 | + # assume we only extract US data |
| 85 | + eastern = pytz.timezone('US/Eastern') |
| 86 | + |
| 87 | + columns = [ |
| 88 | + ('Open', 'open'), |
| 89 | + ('High', 'high'), |
| 90 | + ('Low', 'low'), |
| 91 | + ('Close', 'close'), |
| 92 | + ('Volume', 'volume'), |
| 93 | + ('Adj Close', 'adjclose')] |
| 94 | + |
| 95 | + yahoo_data = dict() |
| 96 | + yahoo_data['ticker'] = ticker |
| 97 | + yahoo_data['t'] = [ |
| 98 | + # hour=16 to indicate market close time |
| 99 | + since_epoch(eastern.localize(date_.to_pydatetime().replace(hour=16))) |
| 100 | + for date_ in df.index] |
| 101 | + |
| 102 | + for column in columns: |
| 103 | + yahoo_data[column[1]] = map(numpy.asscalar, df[column[0]].values) |
| 104 | + |
| 105 | + properties = {'yahoo': yahoo_data} |
| 106 | + |
| 107 | + data = { |
| 108 | + 'event': '$set', |
| 109 | + 'entityType': 'yahoo', |
| 110 | + 'entityId': ticker, |
| 111 | + 'properties': properties, |
| 112 | + 'appId': app_id, |
| 113 | + 'eventTime': event_time.replace(tzinfo=pytz.utc).isoformat(), |
| 114 | + } |
| 115 | + |
| 116 | + response = client.create_event(data) |
| 117 | + print(response) |
| 118 | + |
| 119 | + |
def import_all(app_id):
    """This method import all SP500 stocks and some SPDR ETFs."""
    # (startTime, endTime, eventTime) windows; consecutive windows overlap
    # by one month.
    time_slices = [
        (datetime(1999, 1, 1), datetime(2004, 1, 1), datetime(2004, 1, 2)),
        (datetime(2003, 12, 1), datetime(2009, 1, 1), datetime(2009, 1, 2)),
        (datetime(2008, 12, 1), datetime(2014, 9, 1), datetime(2014, 9, 2)),
    ]

    client = predictionio.EventClient(
        app_id=app_id, threads=1, url='http://localhost:7070')

    for ticker in SP500_LIST + ETF_LIST:
        for start, end, event_time in time_slices:
            import_data(client, app_id, ticker, start, end, event_time)
| 137 | + |
| 138 | + |
def import_data_with_gaps(app_id):
    """This method import data with time gaps.

    Data imported by this method is used by stock engine, it demonsrates how
    it can handle time series data with gaps.
    """
    client = predictionio.EventClient(
        app_id=app_id, threads=1, url='http://localhost:7070')

    def run_slices(tickers, slices):
        # Import every (startTime, endTime, eventTime) window for each ticker.
        for ticker in tickers:
            for start, end, event_time in slices:
                import_data(client, app_id, ticker, start, end, event_time)

    # time_slices is discontinuted
    # startTime, endTime, eventDate
    run_slices(
        ['SPY', 'AAPL', 'IBM', 'MSFT'],
        [
            (datetime(2013, 12, 1), datetime(2014, 2, 1), datetime(2014, 2, 2)),
            (datetime(2014, 1, 1), datetime(2014, 1, 20), datetime(2014, 2, 10)),
            (datetime(2014, 1, 10), datetime(2014, 2, 20), datetime(2014, 2, 28)),
            (datetime(2014, 2, 10), datetime(2014, 3, 31), datetime(2014, 4, 2)),
            (datetime(2014, 5, 1), datetime(2014, 6, 15), datetime(2014, 6, 20)),
            (datetime(2014, 6, 1), datetime(2014, 7, 1), datetime(2014, 7, 15)),
        ])

    # below are data with holes
    run_slices(
        ['AMZN'],
        [
            (datetime(2014, 1, 1), datetime(2014, 1, 20), datetime(2014, 2, 10)),
            (datetime(2014, 2, 10), datetime(2014, 3, 31), datetime(2014, 4, 2)),
            (datetime(2014, 6, 1), datetime(2014, 7, 1), datetime(2014, 7, 15)),
        ])

    run_slices(
        ['FB'],
        [
            (datetime(2014, 1, 10), datetime(2014, 2, 20), datetime(2014, 2, 28)),
            (datetime(2014, 2, 10), datetime(2014, 3, 31), datetime(2014, 4, 2)),
        ])
| 189 | + |
| 190 | + |
def import_one(app_id):
    """Import TSLA.

    Import data with from 2014-01-01 until 2014-03-01. event_time specifies
    when this data is extracted.
    """
    client = predictionio.EventClient(
        app_id=app_id, threads=1, url='http://localhost:7070')

    import_data(
        client, app_id, 'TSLA',
        datetime(2014, 1, 1),   # start_time
        datetime(2014, 3, 1),   # end_time
        datetime(2014, 9, 1))   # event_time
| 206 | + |
| 207 | + |
if __name__ == '__main__':
    # Alternative entry points (left disabled): bulk S&P 500/ETF import,
    # and the gapped-data import used by the stock engine demo.
    #import_all(app_id=2)
    #import_data_with_gaps(app_id=1)
    import_one(app_id=1)
0 commit comments