
Commit eb8ed92

committed
Add a speed comparison of three methods for crawling the Wandoujia Design Award pages
1 parent bbd1c16 commit eb8ed92

File tree

2 files changed: +73, −28 lines


.idea/workspace.xml

Lines changed: 14 additions & 14 deletions
Some generated files are not rendered by default.

Python爬虫日记系列/Python爬虫日记九:豌豆荚设计奖三种爬取方法效率对比.py

Lines changed: 59 additions & 14 deletions
@@ -2,6 +2,8 @@
 import random
 import requests
 import pymongo
+import aiohttp
+import asyncio
 from bs4 import BeautifulSoup
 import multiprocessing
 

@@ -59,16 +61,16 @@
 }
 
 proxies = {
-    'http': '123.206.6.17:8080',
-    # 'https':'123.206.6.17:80'
+    'http': 'http://123.206.6.17:3128',
+    'https': 'http://123.206.6.17:3128'
 }
 
 
-# Method 1: using requests + BeautifulSoup
+# Method 1: using plain requests
 def method_1():
     start = time.time()
     for url in urls:
-        html = requests.get(url, headers=headers).text
+        html = requests.get(url, headers=headers, proxies=proxies).text
         soup = BeautifulSoup(html, 'lxml')
         title = soup.find_all(class_='title')
         app_title = soup.find_all(class_='app-title')
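A note on the proxies change in the hunk above: the old entry lacked a URL scheme in its value, while the new one carries the full 'http://...' form that requests expects. A minimal sketch of how such a scheme-keyed mapping is passed to requests; the proxy address is the one from the diff, while the target URL and timeout are placeholders, not values from the commit:

import requests

# Scheme-keyed proxy mapping; each value is a full proxy URL, including
# the 'http://' prefix the pre-commit entry was missing.
proxies = {
    'http': 'http://123.206.6.17:3128',
    'https': 'http://123.206.6.17:3128',
}

# 'http://example.com' is a placeholder target, not a URL from the commit.
resp = requests.get('http://example.com', proxies=proxies, timeout=10)
print(resp.status_code)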
@@ -82,13 +84,20 @@ def method_1():
             'icon_cover': icon_cover_i['data-original']
         }
         col.insert(content)
-        # print('Inserted one record: ' + str(content))
+        print('Inserted one record: ' + str(content))
     print('Total time: ' + str(time.time() - start))
 
 
-# Method 2: using Requests + BeautifulSoup + Pool
+# if __name__ == '__main__':
+#     method_1()
+
+
+
+
+
+# Method 2: using Requests + Pool
 def method_2(url):
-    html = requests.get(url, headers=headers).text
+    html = requests.get(url, headers=headers, proxies=proxies).text
     soup = BeautifulSoup(html, 'lxml')
     title = soup.find_all(class_='title')
     app_title = soup.find_all(class_='app-title')
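Unlike method_1, method_2 takes a single URL, precisely so that multiprocessing.Pool.map can fan the URL list out across worker processes. The driver that the next hunk comments out looks like this when isolated as a self-contained sketch; crawl and the empty urls list are stand-ins for method_2 and the script's URL list:

import multiprocessing
import time

def crawl(url):
    # Stand-in for method_2: fetch and parse a single page.
    pass

if __name__ == '__main__':
    urls = []  # stand-in for the script's URL list
    start = time.time()
    pool = multiprocessing.Pool(4)   # four worker processes
    pool.map(crawl, urls)            # one call per URL, distributed across workers
    pool.close()
    pool.join()
    print('Total time: ' + str(time.time() - start))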
@@ -103,13 +112,49 @@ def method_2(url):
         }
         # time.sleep(1)
         col.insert(content)
-        # print('Inserted one record: ' + str(content))
+        print('Inserted one record: ' + str(content))
 
 
-if __name__ == '__main__':
+# if __name__ == '__main__':
+#     start = time.time()
+#     pool = multiprocessing.Pool(4)
+#     pool.map(method_2, urls)
+#     pool.close()
+#     pool.join()
+#     print('Total time: ' + str(time.time() - start))
+
+
+# Method 3: Asyncio + Aiohttp, the async IO modules available since Python 3.4
+
+def method_3():
+    async def get_url(url):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as html:
+                response = await html.text(encoding="utf-8")
+                return response
+
+    async def parser(url):
+        html = await get_url(url)
+        soup = BeautifulSoup(html, 'lxml')
+        title = soup.find_all(class_='title')
+        app_title = soup.find_all(class_='app-title')
+        item_cover = soup.find_all(class_='item-cover')
+        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
+        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
+            content = {
+                'title': title_i.get_text(),
+                'app_title': app_title_i.get_text(),
+                'item_cover': item_cover_i['data-original'],
+                'icon_cover': icon_cover_i['data-original']
+            }
+            col.insert(content)
+            print('Inserted one record: ' + str(content))
     start = time.time()
-    pool = multiprocessing.Pool(4)
-    pool.map(method_2, urls)
-    pool.close()
-    pool.join()
-    print('Total time: ' + str(time.time() - start))
+    loop = asyncio.get_event_loop()
+    tasks = [parser(url) for url in urls]
+    loop.run_until_complete(asyncio.gather(*tasks))
+    print(time.time() - start)
+
+
+if __name__ == '__main__':
+    method_3()
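One design note on method_3 as committed: it opens a fresh aiohttp.ClientSession for every URL, while aiohttp's documentation recommends sharing one session so the connection pool is reused across requests. A minimal sketch of that variant under the same event-loop style as the diff; fetch_all, fetch, and the empty urls list are illustrative names, not part of the commit:

import asyncio
import aiohttp

async def fetch_all(urls):
    # One shared session reuses connections across all concurrent requests.
    async with aiohttp.ClientSession() as session:
        async def fetch(url):
            async with session.get(url) as resp:
                return await resp.text(encoding='utf-8')
        return await asyncio.gather(*(fetch(url) for url in urls))

if __name__ == '__main__':
    urls = []  # stand-in for the script's URL list
    loop = asyncio.get_event_loop()
    pages = loop.run_until_complete(fetch_all(urls))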
