import random
import requests
import pymongo
+ import aiohttp
+ import asyncio
from bs4 import BeautifulSoup
import multiprocessing

}

proxies = {
-     'http': '123.206.6.17:8080',
-     # 'https': '123.206.6.17:80'
+     'http': 'http://123.206.6.17:3128',
+     'https': 'http://123.206.6.17:3128'
}
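For reference, the hunks below rely on `time`, `urls`, `headers`, and `col`, which live in unchanged parts of the file and do not appear in this diff. A minimal sketch of what that setup could look like, with placeholder connection details and URLs rather than the author's actual values:

import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)   # assumed local MongoDB instance
col = client['spider_db']['apps']                  # 'col' is the collection written to by all three methods
headers = {'User-Agent': 'Mozilla/5.0'}            # only the closing brace of the real headers dict is visible above
urls = ['https://example.com/apps?page={}'.format(i) for i in range(1, 11)]   # placeholder URL list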


- # Method 1: using requests + BeautifulSoup
+ # Method 1: using plain requests
def method_1():
    start = time.time()
    for url in urls:
-         html = requests.get(url, headers=headers).text
+         html = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
@@ -82,13 +84,20 @@ def method_1():
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert(content)
-             # print('Successfully inserted a record: ' + str(content))
+             print('Successfully inserted a record: ' + str(content))
    print('Total time: ' + str(time.time() - start))


- # Method 2: using Requests + BeautifulSoup + Pool
+ # if __name__ == '__main__':
+ #     method_1()
+
+
+
+
+
+ # Method 2: using Requests + Pool
def method_2(url):
-     html = requests.get(url, headers=headers).text
+     html = requests.get(url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(class_='title')
    app_title = soup.find_all(class_='app-title')
@@ -103,13 +112,49 @@ def method_2(url):
        }
        # time.sleep(1)
        col.insert(content)
-         # print('Successfully inserted a record: ' + str(content))
+         print('Successfully inserted a record: ' + str(content))


- if __name__ == '__main__':
+ # if __name__ == '__main__':
+ #     start = time.time()
+ #     pool = multiprocessing.Pool(4)
+ #     pool.map(method_2, urls)
+ #     pool.close()
+ #     pool.join()
+ #     print('Total time: ' + str(time.time() - start))
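A side note on the commented-out Pool driver above (not part of the commit): keeping the Pool under the `if __name__ == '__main__':` guard is what lets the worker processes import this module safely, and a `with` block makes the close/join pair implicit. A minimal sketch of re-enabling it, reusing the names from the diff:

if __name__ == '__main__':
    start = time.time()
    # the context manager closes and joins the pool on exit
    with multiprocessing.Pool(4) as pool:
        pool.map(method_2, urls)
    print('Total time: ' + str(time.time() - start))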
+
+
+ # Method 3: Asyncio + Aiohttp, the async I/O modules introduced in Python 3.4+
+
+ def method_3():
+     # fetch one page and return its decoded body
+     async def get_url(url):
+         async with aiohttp.ClientSession() as session:
+             async with session.get(url) as html:
+                 response = await html.text(encoding="utf-8")
+                 return response
+
+     # download one page, parse it, and store each item in MongoDB
+     async def parser(url):
+         html = await get_url(url)
+         soup = BeautifulSoup(html, 'lxml')
+         title = soup.find_all(class_='title')
+         app_title = soup.find_all(class_='app-title')
+         item_cover = soup.find_all(class_='item-cover')
+         icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
+         for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
+             content = {
+                 'title': title_i.get_text(),
+                 'app_title': app_title_i.get_text(),
+                 'item_cover': item_cover_i['data-original'],
+                 'icon_cover': icon_cover_i['data-original']
+             }
+             col.insert(content)
+             print('Successfully inserted a record: ' + str(content))
    start = time.time()
-     pool = multiprocessing.Pool(4)
-     pool.map(method_2, urls)
-     pool.close()
-     pool.join()
-     print('Total time: ' + str(time.time() - start))
+     loop = asyncio.get_event_loop()
+     tasks = [parser(url) for url in urls]
+     loop.run_until_complete(asyncio.gather(*tasks))   # run all page tasks concurrently
+     print(time.time() - start)
+
+
+ if __name__ == '__main__':
+     method_3()
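Two follow-up notes on method_3, neither of which is part of the commit: on Python 3.7+ the explicit event-loop handling can be replaced by asyncio.run(), and unlike methods 1 and 2 the aiohttp requests above send neither the headers nor the proxy. A hedged sketch of both, shown at module level for brevity and assuming the author would want the same proxy endpoint (aiohttp takes a single proxy URL per request rather than a requests-style dict):

# hypothetical variant of get_url that also sends the headers and the proxy
async def get_url(url):
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url, proxy='http://123.206.6.17:3128') as html:
            return await html.text(encoding="utf-8")

# Python 3.7+ driver: asyncio.run() creates and closes the loop itself
async def main():
    await asyncio.gather(*(parser(url) for url in urls))

asyncio.run(main())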