Skip to content

Commit 931090a

Browse files
author
jinxin0924
committed
2016清明学习爬虫
1 parent 638d28e commit 931090a

File tree

2 files changed

+227
-1
lines changed

2 files changed

+227
-1
lines changed

爬取妹子图片.ipynb

Lines changed: 156 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1328,14 +1328,169 @@
13281328
"ContentSoup.find_all('img')"
13291329
]
13301330
},
1331+
{
1332+
"cell_type": "code",
1333+
"execution_count": 72,
1334+
"metadata": {
1335+
"collapsed": true
1336+
},
1337+
"outputs": [],
1338+
"source": [
1339+
"from multiprocessing import Pool,Manager"
1340+
]
1341+
},
1342+
{
1343+
"cell_type": "code",
1344+
"execution_count": 75,
1345+
"metadata": {
1346+
"collapsed": false
1347+
},
1348+
"outputs": [],
1349+
"source": [
1350+
"from multiprocessing import Pool,Manager,Queue"
1351+
]
1352+
},
1353+
{
1354+
"cell_type": "code",
1355+
"execution_count": 88,
1356+
"metadata": {
1357+
"collapsed": true
1358+
},
1359+
"outputs": [],
1360+
"source": [
1361+
"t=Queue()"
1362+
]
1363+
},
1364+
{
1365+
"cell_type": "code",
1366+
"execution_count": 89,
1367+
"metadata": {
1368+
"collapsed": true
1369+
},
1370+
"outputs": [],
1371+
"source": [
1372+
"t.put(1)\n",
1373+
"t.put(2)"
1374+
]
1375+
},
1376+
{
1377+
"cell_type": "code",
1378+
"execution_count": 90,
1379+
"metadata": {
1380+
"collapsed": false
1381+
},
1382+
"outputs": [
1383+
{
1384+
"data": {
1385+
"text/plain": [
1386+
"1"
1387+
]
1388+
},
1389+
"execution_count": 90,
1390+
"metadata": {},
1391+
"output_type": "execute_result"
1392+
}
1393+
],
1394+
"source": [
1395+
"t.get()"
1396+
]
1397+
},
1398+
{
1399+
"cell_type": "code",
1400+
"execution_count": 91,
1401+
"metadata": {
1402+
"collapsed": false
1403+
},
1404+
"outputs": [
1405+
{
1406+
"data": {
1407+
"text/plain": [
1408+
"2"
1409+
]
1410+
},
1411+
"execution_count": 91,
1412+
"metadata": {},
1413+
"output_type": "execute_result"
1414+
}
1415+
],
1416+
"source": [
1417+
"t.get()"
1418+
]
1419+
},
1420+
{
1421+
"cell_type": "code",
1422+
"execution_count": 92,
1423+
"metadata": {
1424+
"collapsed": false
1425+
},
1426+
"outputs": [
1427+
{
1428+
"name": "stdout",
1429+
"output_type": "stream",
1430+
"text": [
1431+
"1\n"
1432+
]
1433+
}
1434+
],
1435+
"source": [
1436+
"if t:\n",
1437+
" print(1)"
1438+
]
1439+
},
1440+
{
1441+
"cell_type": "code",
1442+
"execution_count": 94,
1443+
"metadata": {
1444+
"collapsed": false
1445+
},
1446+
"outputs": [
1447+
{
1448+
"data": {
1449+
"text/plain": [
1450+
"True"
1451+
]
1452+
},
1453+
"execution_count": 94,
1454+
"metadata": {},
1455+
"output_type": "execute_result"
1456+
}
1457+
],
1458+
"source": [
1459+
"t.empty()"
1460+
]
1461+
},
13311462
{
13321463
"cell_type": "code",
13331464
"execution_count": null,
13341465
"metadata": {
13351466
"collapsed": true
13361467
},
13371468
"outputs": [],
1338-
"source": []
1469+
"source": [
1470+
"t"
1471+
]
1472+
},
1473+
{
1474+
"cell_type": "code",
1475+
"execution_count": 95,
1476+
"metadata": {
1477+
"collapsed": true
1478+
},
1479+
"outputs": [],
1480+
"source": [
1481+
"p=Pool(4)"
1482+
]
1483+
},
1484+
{
1485+
"cell_type": "code",
1486+
"execution_count": null,
1487+
"metadata": {
1488+
"collapsed": true
1489+
},
1490+
"outputs": [],
1491+
"source": [
1492+
"p."
1493+
]
13391494
}
13401495
],
13411496
"metadata": {

爬妹子v3.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,72 @@
11
__author__ = 'Xing'
2+
3+
# 在v3中,要实现:1)采用多进程爬取图片
4+
#会有timeout错误,还有 Max retries exceeded with url: /a/nvshen.html (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x1065007f0>, 'Connection to www.meizitu.com timed out. (connect timeout=5)'))
5+
#应该都是网络问题
6+
7+
8+
import requests
9+
from bs4 import BeautifulSoup
10+
import urllib.request
11+
import urllib
12+
from collections import deque
13+
from multiprocessing import Pool,Manager,Queue,Lock
14+
15+
16+
def process(url, totalCnt, visited, nameSet):
    """Breadth-first crawl starting at *url*, downloading every photo found.

    Args:
        url: seed page URL for this worker's crawl.
        totalCnt: Manager().dict() with key 'cnt' — shared download counter.
        visited: set of URLs already crawled. NOTE(review): a plain set is
            not shared between pool processes; each worker mutates its own
            pickled copy — confirm whether cross-process dedup is intended.
        nameSet: set of photo store-names already saved (same caveat).

    Relies on module-level ``head``, ``TimeOut`` and ``path`` defined in the
    ``__main__`` block — available in workers only with the 'fork' start
    method (NOTE(review): would fail under 'spawn', e.g. on Windows).
    """
    frontier = deque()  # URLs still to explore (FIFO — breadth-first)
    frontier.append(url)
    while frontier:
        try:
            url = frontier.popleft()
            visited.add(url)
            response = requests.get(url, headers=head, timeout=TimeOut)
            soup = BeautifulSoup(response.content, 'lxml')
            # Collect outgoing links for further exploration.
            for link in soup.find_all('a'):
                web_url = link.get('href')
                if web_url and web_url not in visited:  # skip empty hrefs / seen pages
                    frontier.append(web_url)
            # Download any photos found on this page.
            for photo_tag in soup.find_all('img'):
                photoUrl = photo_tag.get('src')
                if not photoUrl:
                    continue  # BUGFIX: <img> without src crashed the 'in' tests below
                if 'erweima' in photoUrl:
                    continue  # skip QR-code images
                if 'limg' in photoUrl:
                    continue  # skip small thumbnails
                if 'templets' in photoUrl:
                    continue  # skip site-template images
                photoName = photo_tag.get('alt')
                # BUGFIX: URLs without 'uploads' used to raise IndexError on [1].
                parts = photoUrl.split('uploads')
                if len(parts) < 2:
                    continue
                photoStoreName = parts[1]
                if photoName and photoStoreName not in nameSet:
                    photo = requests.get(photoUrl, headers=head, timeout=TimeOut)
                    with open(path + photoName, 'wb') as newfile:
                        newfile.write(photo.content)
                    nameSet.add(photoStoreName)
                    # Increment first so the progress line reports a true count
                    # (previously printed "get 0th photo" on the first download).
                    totalCnt['cnt'] += 1
                    if totalCnt['cnt'] % 100 == 0:
                        print('get' + ' ' + str(totalCnt['cnt']) + 'th' + ' photo')
        except Exception as e:
            # Narrowed from BaseException so Ctrl-C / SystemExit still work;
            # per-page failures (timeouts, bad URLs) are logged and skipped.
            print(e)
49+
50+
51+
if __name__ == '__main__':
    url = "http://www.meizitu.com/"
    path = '/Users/Xing/Documents/Crawler/sexy/'
    # NOTE(review): plain sets are pickled into each worker, not shared across
    # processes — workers may re-download the same photo; consider a
    # Manager-backed structure if cross-process dedup matters.
    visited = set()    # URLs already crawled / dispatched
    nameSet = set()    # photo store-names already saved
    TimeOut = 5        # seconds — per-request connect/read timeout
    head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    totalCnt = Manager().dict()  # shared counter across the pool
    totalCnt['cnt'] = 0
    p = Pool(4)
    # Fetch the front page once and hand each outgoing link to a worker.
    response = requests.session().get(url, headers=head, timeout=TimeOut)
    soup = BeautifulSoup(response.content, 'lxml')
    for webText in soup.find_all('a'):
        web_url = webText.get('href')
        if web_url and web_url not in visited:  # non-empty href, not yet dispatched
            # BUGFIX: mark as seen, otherwise the 'not in visited' guard above
            # was dead and duplicate links were dispatched repeatedly.
            visited.add(web_url)
            # BUGFIX: dispatch web_url (the discovered link), not the root url —
            # previously every worker re-crawled the homepage.
            p.apply_async(process, args=(web_url, totalCnt, visited, nameSet))
    p.close()
    p.join()

0 commit comments

Comments
 (0)