1
1
#Github项目分析一
2
2
3
- #用matplotlib生成图表
3
+ ## 用matplotlib生成图表
4
4
5
5
如何分析用户的数据是一个有趣的问题,特别是当我们有大量的数据的时候。
6
6
除了`` matlab `` ,我们还可以用`` numpy `` +`` matplotlib ``
7
7
8
- ##python github用户数据分析##
8
+ ### python github用户数据分析
9
9
10
10
数据可以在这边寻找到
11
11
12
12
[ https://github.com/gmszone/ml ] ( https://github.com/gmszone/ml )
13
13
14
14
最后效果图
15
- <img src =" https://raw.githubusercontent.com/gmszone/ml/master/screenshots/2014-01-01.png " width =600 >
15
+
16
+ ![ 2014 01 01] ( ./img/2014-01-01.png )
16
17
17
18
要解析的json文件位于`` data/2014-01-01-0.json `` ,大小6.6M,显然我们可能需要用每次只读一行的策略,这足以解释为什么诸如sublime打开的时候很慢,而现在我们只需要里面的json数据中的创建时间。
18
19
19
- ==
20
- 这个文件代表什么?
20
+ ==这个文件代表什么?
21
21
22
22
** 2014年1月1日零时到一时,用户在github上的操作,这里的用户指的是很多用户。一共有4814条数据,从commit、create到issues都有。**
23
23
24
- ##python json文件解析##
24
+ ### python json文件解析
25
25
26
- import json
27
- for line in open(jsonfile):
28
- line = f.readline()
26
+ ``` python
27
+ import json
28
+ for line in open (jsonfile):
29
+ line = f.readline()
30
+ ```
29
31
30
32
然后再解析json
31
- <pre ><code class =" python " >
33
+
34
+ ``` python
32
35
import dateutil.parser
33
36
34
37
lin = json.loads(line)
35
38
date = dateutil.parser.parse(lin[" created_at" ])
36
- </code ></pre >
39
+ ```
40
+
37
41
这里用到了`` dateutil `` ,因为新鲜出炉的数据是string,需要用`` dateutil `` 转换为`` datetime `` ,再把数据放到数组里头。最后就有了`` parse_data ``
38
42
43
+ ``` python
39
44
def parse_data (jsonfile ):
40
45
f = open (jsonfile, " r" )
41
46
dataarray = []
@@ -51,21 +56,27 @@ def parse_data(jsonfile):
51
56
minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
52
57
f.close()
53
58
return minuteswithcount
54
-
59
+ ```
55
60
56
61
下面这句代码就是将上面的解析为
57
62
58
- minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
63
+ ``` python
64
+ minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
65
+ ```
59
66
60
67
这样的数组以便于解析
61
68
62
- [(0, 92), (1, 67), (2, 86), (3, 73), (4, 76), (5, 67), (6, 61), (7, 71), (8, 62), (9, 71), (10, 70), (11, 79), (12, 62), (13, 67), (14, 76), (15, 67), (16, 74), (17, 48), (18, 78), (19, 73), (20, 89), (21, 62), (22, 74), (23, 61), (24, 71), (25, 49), (26, 59), (27, 59), (28, 58), (29, 74), (30, 69), (31, 59), (32, 89), (33, 67), (34, 66), (35, 77), (36, 64), (37, 71), (38, 75), (39, 66), (40, 62), (41, 77), (42, 82), (43, 95), (44, 77), (45, 65), (46, 59), (47, 60), (48, 54), (49, 66), (50, 74), (51, 61), (52, 71), (53, 90), (54, 64), (55, 67), (56, 67), (57, 55), (58, 68), (59, 91)]
69
+ ``` python
70
+ [(0 , 92 ), (1 , 67 ), (2 , 86 ), (3 , 73 ), (4 , 76 ), (5 , 67 ), (6 , 61 ), (7 , 71 ), (8 , 62 ), (9 , 71 ), (10 , 70 ), (11 , 79 ), (12 , 62 ), (13 , 67 ), (14 , 76 ), (15 , 67 ), (16 , 74 ), (17 , 48 ), (18 , 78 ), (19 , 73 ), (20 , 89 ), (21 , 62 ), (22 , 74 ), (23 , 61 ), (24 , 71 ), (25 , 49 ), (26 , 59 ), (27 , 59 ), (28 , 58 ), (29 , 74 ), (30 , 69 ), (31 , 59 ), (32 , 89 ), (33 , 67 ), (34 , 66 ), (35 , 77 ), (36 , 64 ), (37 , 71 ), (38 , 75 ), (39 , 66 ), (40 , 62 ), (41 , 77 ), (42 , 82 ), (43 , 95 ), (44 , 77 ), (45 , 65 ), (46 , 59 ), (47 , 60 ), (48 , 54 ), (49 , 66 ), (50 , 74 ), (51 , 61 ), (52 , 71 ), (53 , 90 ), (54 , 64 ), (55 , 67 ), (56 , 67 ), (57 , 55 ), (58 , 68 ), (59 , 91 )]
71
+ ```
63
72
64
- ##matplotlib##
65
- 开始之前需要安装``matplotlib
73
+ ## matplotlib
66
74
67
- sudo pip install matplotlib
75
+ 开始之前需要安装`` matplotlib ``
68
76
77
+ ``` bash
78
+ sudo pip install matplotlib
79
+ ```
69
80
然后引入这个库
70
81
71
82
import matplotlib.pyplot as plt
@@ -81,67 +92,68 @@ def parse_data(jsonfile):
81
92
82
93
最后代码可见
83
94
84
- #!/usr/bin/env python
85
- # -*- coding: utf-8 -*-
86
-
87
- import json
88
- import dateutil.parser
89
- import numpy as np
90
- import matplotlib.mlab as mlab
91
- import matplotlib.pyplot as plt
92
-
93
-
94
- def parse_data(jsonfile):
95
- f = open(jsonfile, "r")
96
- dataarray = []
97
- datacount = 0
98
-
99
- for line in open(jsonfile):
100
- line = f.readline()
101
- lin = json.loads(line)
102
- date = dateutil.parser.parse(lin["created_at"])
103
- datacount += 1
104
- dataarray.append(date.minute)
105
-
106
- minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
107
- f.close()
108
- return minuteswithcount
109
-
110
-
111
- def draw_date(files):
112
- x = []
113
- y = []
114
- mwcs = parse_data(files)
115
- for mwc in mwcs:
116
- x.append(mwc[0])
117
- y.append(mwc[1])
118
-
119
- plt.figure(figsize=(8,4))
120
- plt.plot(x, y,label = files)
121
- plt.legend()
122
- plt.show()
123
-
124
- draw_date("data/2014-01-01-0.json")
125
-
126
-
127
- #每周分析
95
+
96
+ ``` python
97
+ # !/usr/bin/env python
98
+ # -*- coding: utf-8 -*-
99
+
100
+ import json
101
+ import dateutil.parser
102
+ import numpy as np
103
+ import matplotlib.mlab as mlab
104
+ import matplotlib.pyplot as plt
105
+
106
+
107
+ def parse_data (jsonfile ):
108
+ f = open (jsonfile, " r" )
109
+ dataarray = []
110
+ datacount = 0
111
+
112
+ for line in open (jsonfile):
113
+ line = f.readline()
114
+ lin = json.loads(line)
115
+ date = dateutil.parser.parse(lin[" created_at" ])
116
+ datacount += 1
117
+ dataarray.append(date.minute)
118
+
119
+ minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
120
+ f.close()
121
+ return minuteswithcount
122
+
123
+
124
+ def draw_date (files ):
125
+ x = []
126
+ y = []
127
+ mwcs = parse_data(files)
128
+ for mwc in mwcs:
129
+ x.append(mwc[0 ])
130
+ y.append(mwc[1 ])
131
+
132
+ plt.figure(figsize = (8 ,4 ))
133
+ plt.plot(x, y,label = files)
134
+ plt.legend()
135
+ plt.show()
136
+
137
+ draw_date(" data/2014-01-01-0.json" )
138
+ ```
139
+
140
+ ## 每周分析
128
141
129
142
继上篇之后,我们就可以分析用户的每周提交情况,以得出用户的真正的工作效率,每个程序员的工作时间可能是不一样的,如
130
- ![ Phodal Huang's Report] [ 1 ]
131
143
132
- [ 1 ] : https://www.phodal.com/static/media/uploads/screen_shot_2014-04-12_at_9.58.52_am.png
144
+ ![ Phodal Huang's Report ] ( ./img/phodal-results )
133
145
134
146
这是我的每周情况,显然如果把星期六移到前面的话,随着工作时间的增长,在github上的使用在下降,作为一个
135
147
136
148
a fulltime hacker who works best in the evening (around 8 pm).
137
149
138
150
不过这个是osrc的分析结果。
139
151
140
- ##python github 每周情况分析##
152
+ ### python github 每周情况分析
141
153
142
154
看一张分析后的结果
143
155
144
- < img src = " https://raw.githubusercontent.com/gmszone/ml/master/screenshots/ feb-results.png" width = 600 >
156
+ ![ Feb Results ] ( ./img/ feb-results.png)
145
157
146
158
结果正好与我的情况相反?似乎图上是这么说的,但是数据上是这样的情况。
147
159
@@ -174,67 +186,71 @@ def parse_data(jsonfile):
174
186
8474, 7984, 12933, 13504, 13763, 13544, 12940,
175
187
7119, 7346, 13412, 14008, 12555
176
188
177
- ##python 数据分析##
189
+ ### python 数据分析
178
190
179
191
重写了一个新的方法用于计算提交数,直至后面才意识到其实我们可以算行数就够了,但是方法上有点hack
180
192
181
- <pre ><code class =" python " >
182
- def get_minutes_counts_with_id(jsonfile):
183
- datacount, dataarray = handle_json(jsonfile)
184
- minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
185
- return minuteswithcount
186
-
187
-
188
- def handle_json(jsonfile):
189
- f = open(jsonfile, "r")
190
- dataarray = []
191
- datacount = 0
192
-
193
- for line in open(jsonfile):
194
- line = f.readline()
195
- lin = json.loads(line)
196
- date = dateutil.parser.parse(lin["created_at"])
197
- datacount += 1
198
- dataarray.append(date.minute)
199
-
200
- f.close()
201
- return datacount, dataarray
202
-
203
-
204
- def get_minutes_count_num(jsonfile):
205
- datacount, dataarray = handle_json(jsonfile)
206
- return datacount
207
-
208
-
209
- def get_month_total():
210
- """
211
-
212
- :rtype : object
213
- """
214
- monthdaycount = []
215
- for i in range(1, 20):
216
- if i < 10:
217
- filename = 'data/2014-02-0' + i.__str__() + '-0.json'
218
- else:
219
- filename = 'data/2014-02-' + i.__str__() + '-0.json'
220
- monthdaycount.append(get_minutes_count_num(filename))
221
- return monthdaycount
222
- </code ></pre >
193
+ ``` python
194
+ def get_minutes_counts_with_id (jsonfile ):
195
+ datacount, dataarray = handle_json(jsonfile)
196
+ minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
197
+ return minuteswithcount
198
+
199
+
200
+ def handle_json (jsonfile ):
201
+ f = open (jsonfile, " r" )
202
+ dataarray = []
203
+ datacount = 0
204
+
205
+ for line in open (jsonfile):
206
+ line = f.readline()
207
+ lin = json.loads(line)
208
+ date = dateutil.parser.parse(lin[" created_at" ])
209
+ datacount += 1
210
+ dataarray.append(date.minute)
211
+
212
+ f.close()
213
+ return datacount, dataarray
214
+
215
+
216
+ def get_minutes_count_num (jsonfile ):
217
+ datacount, dataarray = handle_json(jsonfile)
218
+ return datacount
219
+
220
+
221
+ def get_month_total ():
222
+ """
223
+
224
+ :rtype : object
225
+ """
226
+ monthdaycount = []
227
+ for i in range (1 , 20 ):
228
+ if i < 10 :
229
+ filename = ' data/2014-02-0' + i.__str__ () + ' -0.json'
230
+ else :
231
+ filename = ' data/2014-02-' + i.__str__ () + ' -0.json'
232
+ monthdaycount.append(get_minutes_count_num(filename))
233
+ return monthdaycount
234
+ ```
235
+
223
236
接着我们需要去遍历每个结果,后面的后面会发现这个效率真的是太低了,为什么木有多线程?
224
237
225
- ##python matplotlib图表##
238
+ ### python matplotlib图表
239
+
226
240
让我们的matplotlib来做这些图表的工作
227
241
228
- if __name__ == '__main__':
229
- results = pd.get_month_total()
230
- print results
231
-
232
- plt.figure(figsize=(8, 4))
233
- plt.plot(results.__getslice__(0, 7), label="first week")
234
- plt.plot(results.__getslice__(7, 14), label="second week")
235
- plt.plot(results.__getslice__(14, 21), label="third week")
236
- plt.legend()
237
- plt.show()
242
+ ``` python
243
+ if __name__ == ' __main__' :
244
+ results = pd.get_month_total()
245
+ print results
246
+
247
+ plt.figure(figsize = (8 , 4 ))
248
+ plt.plot(results.__getslice__ (0 , 7 ), label = " first week" )
249
+ plt.plot(results.__getslice__ (7 , 14 ), label = " second week" )
250
+ plt.plot(results.__getslice__ (14 , 21 ), label = " third week" )
251
+ plt.legend()
252
+ plt.show()
253
+ ```
238
254
239
255
蓝色的是第一周,绿色的是第二周,红色的是第三周,就有了上面的结果。
240
256
0 commit comments