Commit 1452f4f ("add")
1 parent 879ccdc

28 files changed: +249, -4 lines
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@

def creatdataset():
    # Toy dataset for the AdaBoost example: x is the feature, y the class label
    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
    return x, y
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@

### adaboost

The example from the book 《统计学习方法》 (Statistical Learning Methods).
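A minimal sketch, not part of this commit, of running AdaBoost on this toy set; it assumes scikit-learn is installed and mirrors the `creatdataset` helper added above:

import numpy as np
from sklearn.ensemble import AdaBoostClassifier

def creatdataset():  # same name and data as the committed file
    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
    return x, y

x, y = creatdataset()
X = np.array(x).reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix

# Boost decision stumps (scikit-learn's default weak learner); in the book's
# Example 8.1, three rounds of stump boosting reach zero training error.
clf = AdaBoostClassifier(n_estimators=3)
clf.fit(X, y)
print(clf.predict(X))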
23 binary files changed (89.2 KB to 1.57 MB each; contents not shown).
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@

import numpy as np
import pickle
import tensorflow as tf

abs_path = 'C:/Users/wwwa8/Documents/GitHub/Machine-Learning/序列预测/PCA去趋势化/dev/'


# The entries of the raw `data` are DataFrames; transfer() extracts the
# volume column of each one into a row of a plain 2-D NumPy array.
def transfer(data):
    vol_col_index = 1  # column index of the traffic-volume field
    height = len(data)
    width = data[0].shape[0]

    arr = np.zeros((height, width))
    for i in range(height):
        for j in range(width):
            arr[i, j] = data[i].iloc[j, vol_col_index]
    return arr


def createdataset(data):
    dataset = []
    for road in data:  # the per-day DataFrames of one road
        dataset.append(transfer(road))
    return dataset


def myload():
    filename = 'dump.txt'
    f = open(abs_path + filename, 'rb')
    data = pickle.load(f)
    f.close()
    # print(data)  # number of road segments * per-segment info (DataFrames)
    return data


def split_dataset(arr):
    trainX = []
    trainY = []
    trainX_len = 2  # number of days used as input
    trainY_len = 1  # number of days to predict
    day = 24 * 60
    merge_step = 3
    daylen = day // merge_step  # points per day after merge_step aggregation

    days = arr.shape[1] // daylen  # total number of days
    for i in range(0, days - (trainX_len + trainY_len - 1)):
        trainX.append(arr[i*daylen:(i+trainX_len)*daylen])
        trainY.append(arr[(i+trainX_len)*daylen:(i+trainX_len+trainY_len)*daylen])
    return trainX, trainY


if __name__ == "__main__":
    data = myload()
    # transfer
    dataset = createdataset(data)  # dataset layout: road * day * readings within the day
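For intuition, a tiny self-contained sketch (toy values, not from the commit; `transfer` as defined above is assumed to be in scope) of what `transfer` does to a list of per-day DataFrames:

import pandas as pd

# two fake "days" for one road; column 1 ('vol') is what transfer() extracts
day1 = pd.DataFrame({'speed': [50, 52, 51, 49], 'vol': [10, 12, 11, 9]})
day2 = pd.DataFrame({'speed': [48, 47, 49, 50], 'vol': [8, 7, 9, 10]})

arr = transfer([day1, day2])
print(arr.shape)  # (2, 4): one row per day, one column per reading
print(arr[0])     # day1's volume column: [10. 12. 11.  9.]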
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@

import pandas as pd
from datetime import datetime
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pickle


def read_excel(filepath):
    df = pd.read_excel(filepath, skipfooter=1)
    df.drop_duplicates(subset='last-update-time', keep='first', inplace=True)
    # todo
    df.index = df['last-update-time']
    return df


def select_oneday(df, day):
    select_str = '2012-11-' + str(day)
    ret_df = df[select_str]
    return ret_df


def save(dfs, days):
    for df, day in zip(dfs, days):
        df.to_csv(str(day) + '.csv')


def find_col_index(df, columns_name):
    for i in range(len(df.columns)):
        if df.columns[i] == columns_name:
            return i
    sys.exit("sorry, find_col_index can't find the requested column name.")


def fill_df(result_df):
    speed_col_index = find_col_index(result_df[0], 'speed')
    vol_col_index = find_col_index(result_df[0], 'vol')

    for df_index in range(len(result_df)):
        for i in range(result_df[df_index].shape[0]):
            if np.isnan(result_df[df_index].iloc[i, speed_col_index]):
                if df_index == 0:  # first day: look forward to a later day
                    find_index = df_index + 1
                    while find_index < len(result_df):
                        if np.isnan(result_df[find_index].iloc[i, speed_col_index]) == False:
                            result_df[df_index].iloc[i, speed_col_index] = result_df[find_index].iloc[i, speed_col_index]
                            result_df[df_index].iloc[i, vol_col_index] = result_df[find_index].iloc[i, vol_col_index]
                            break
                        find_index += 1
                else:  # otherwise copy from the previous day
                    result_df[df_index].iloc[i, speed_col_index] = result_df[df_index-1].iloc[i, speed_col_index]
                    result_df[df_index].iloc[i, vol_col_index] = result_df[df_index-1].iloc[i, vol_col_index]


def default_fill(result_df):
    for i in range(len(result_df)):
        result_df[i] = result_df[i].fillna(method='ffill')
    for i in range(len(result_df)):
        result_df[i] = result_df[i].fillna(method='bfill')
    return result_df


# Expand one day's records into one row per minute of that day
def generate_data_byday(df, day, begin_hour=0, end_hour=24):
    newdf = pd.DataFrame(columns=['road', 'vol', 'speed', 'last-update-time'])
    name = df['road'].iloc[0]
    date = '2012-11-' + str(day)
    for hour in range(begin_hour, end_hour):
        for minute in range(0, 60):
            vol_item = np.nan
            speed_item = np.nan
            select_str = date + " " + str(hour) + ":" + str(minute)
            if select_str in df.index:
                vol_item = df[select_str]['vol'].iloc[0]
                speed_item = df[select_str]['speed'].iloc[0]
            newdf.loc[newdf.shape[0]] = [name, vol_item, speed_item, select_str]
    return newdf


def df_filter(dfs):
    for df in dfs:
        df['speed'] = df['speed'].apply(lambda x: min(x, 110))
        df['speed'] = df['speed'].apply(lambda x: max(x, 10))


# Fraction of missing (NaN) points in a column
def miss_rate(data, colname='speed'):
    df = pd.isnull(data[colname])
    df_list = df.tolist()
    miss_rate = sum(df_list) / float(len(df_list))
    print("col : ", colname, ", miss rate is : ", miss_rate)


# Aggregate each DataFrame in dfs over merge_step-minute windows
def merge_dfs(dfs, merge_step=3):
    begin = 0
    end = int(dfs[0].shape[0] / merge_step)
    ret_dfs = []

    for df in dfs:
        ret_df = pd.DataFrame(columns=['road', 'vol', 'speed', 'last-update-time'])
        for step in range(begin, end):
            vol_item = df.iloc[step*merge_step:(step+1)*merge_step]['vol'].mean()
            speed_item = df.iloc[step*merge_step:(step+1)*merge_step]['speed'].mean()
            name = df.iloc[step*merge_step]['road']
            time = df.iloc[step*merge_step]['last-update-time']
            ret_df.loc[ret_df.shape[0]] = [name, vol_item, speed_item, time]
        ret_dfs.append(ret_df)
    print("ori dfs shape is : ", dfs[0].shape)
    print("ret dfs shape is : ", ret_dfs[0].shape)
    return ret_dfs


def heatmap2(data):
    speed_col_index = 2

    height = len(data)
    width = data[0].shape[0]
    arr = np.zeros((height, width))

    for i in range(height):
        for j in range(width):
            arr[i, j] = data[i].iloc[j, speed_col_index]
    plt.matshow(arr, cmap='hot')
    plt.colorbar()
    plt.show()


def mypickle(filepath, data):
    f = open(abs_path + filepath, 'wb')
    pickle.dump(data, f)
    f.close()


abs_path = 'C:/Users/wwwa8/Documents/GitHub/Machine-Learning/序列预测/PCA去趋势化/dev/'

if __name__ == "__main__":
    road_dfs = []
    # each filename holds one road's data
    for filename in range(1, 3):
        filepath = abs_path + str(filename) + '.xls'
        df = read_excel(filepath)
        # the days (of 2012-11) to process
        days = range(8, 10)
        # one row per minute of each day
        dfs = []
        begin_hour = 0
        end_hour = 24
        for day in days:
            dfs.append(generate_data_byday(df, day, begin_hour, end_hour))
        for df in dfs:
            miss_rate(df)
        fill_df(dfs)
        for df in dfs:
            miss_rate(df)

        # fill remaining NaNs
        dfs = default_fill(dfs)
        # clip outliers, both very large and very small
        df_filter(dfs)
        # merge: aggregate the data over merge_step-minute windows
        dfs = merge_dfs(dfs, merge_step=3)
        print("dfs shape : ", len(dfs))
        road_dfs.append(dfs)

    # the first dimension of data is the road segment; the second is the
    # merge_step-aggregated records
    #data = []
    #for road in road_dfs:
    #    data.append(road[0])  # road is a length-1 list wrapping the pandas structure; road[0] unwraps it

    # draw the heat map
    #heatmap2(data)

    # serialize
    mypickle('dump.txt', road_dfs)
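To see the fill, clip, and aggregate steps end to end, a small self-contained sketch with toy values; the columns mirror the script's 'road'/'vol'/'speed' schema, and `clip` plus `groupby` are used here as compact equivalents of the `apply`-based `df_filter` and the loop in `merge_dfs`:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'road':  ['r1'] * 6,
    'vol':   [10, np.nan, 12, 9, 200, 11],
    'speed': [50, np.nan, 55, 140, 52, 5],
    'last-update-time': pd.date_range('2012-11-08', periods=6, freq='min'),
})
df = df.ffill().bfill()                              # same order as default_fill
df['speed'] = df['speed'].clip(lower=10, upper=110)  # same effect as df_filter
# 3-minute aggregation, as in merge_dfs
agg = df.groupby(np.arange(len(df)) // 3).agg(
    {'road': 'first', 'vol': 'mean', 'speed': 'mean', 'last-update-time': 'first'})
print(agg)  # 6 one-minute rows collapse into 2 three-minute rows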

序列预测/PCA去趋势化/lstm_new.py

Lines changed: 10 additions & 4 deletions
@@ -22,7 +22,14 @@ def myload(filename):
     f.close()
     return data

-rest_x = myload("dump_rest_x_9-13.txt")
+#rest_x = myload("dump_rest_x_9-13.txt")
+#arr = myload("dump_arr_9-13.txt")
+#main_x = myload("dump_main_x_9-13.txt")
+
+rest_x = myload("dump_rest_x.txt")
+arr = myload("dump_arr.txt")
+main_x = myload("dump_main_x.txt")
+

 rest_x = rest_x.reshape(-1,1)

@@ -71,7 +78,7 @@ def create_dataset(dataset, look_back=1):
 model.add(LSTM(3, input_shape=(1, look_back)) )
 model.add(Dense(1))
 model.compile(loss='mean_squared_error', optimizer='adam')
-model.fit(trainX, trainY, epochs=30, batch_size=1, verbose=2)
+model.fit(trainX, trainY, epochs=20, batch_size=1, verbose=2)

 # make predictions
 trainPredict = model.predict(trainX)

@@ -118,8 +125,7 @@ def create_dataset(dataset, look_back=1):

 # Test Score: 11.96 RMSE

-arr = myload("dump_arr_9-13.txt")
-main_x = myload("dump_main_x_9-13.txt")
+

 arr = arr.reshape(-1,1)
 main_x = main_x.reshape(-1,1)
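The hunk headers place these edits inside the usual Keras time-series setup. For context, a sketch of the standard windowing helper that a `create_dataset(dataset, look_back=1)` with this signature conventionally implements (an assumption; the full lstm_new.py is not shown in this diff):

import numpy as np

def create_dataset(dataset, look_back=1):
    # Slide a window of length look_back over the series:
    # each X sample is a window, each Y the value right after it.
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        dataX.append(dataset[i:i + look_back, 0])
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

# The LSTM above expects input_shape=(1, look_back), so trainX would be
# reshaped to (samples, 1, look_back) before model.fit:
# trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))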
