Commit 638d28e

jinxin0924 committed: 2016清明学习爬虫 (learning web crawlers over the 2016 Qingming holiday)
1 parent: 278f5d7

9 files changed: +5658 −0 lines

Python 爬虫学习笔记.ipynb

Lines changed: 615 additions & 0 deletions
Large diffs are not rendered by default.

stack/stack/stack_spider.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
__author__ = 'Xing'


from scrapy import Spider
from scrapy.selector import Selector

from items import StackItem


class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        # each question on the listing page is a <div class="summary"> whose <h3> holds the link
        questions = Selector(response).xpath('//div[@class="summary"]/h3')

        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
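
The spider imports StackItem from an items module that does not appear in this commit's rendered diffs. The definition below is a guess inferred from the two fields the spider fills in; treat it as a sketch, not the file's actual contents:

# items.py (hypothetical -- inferred from item['title'] and item['url'] above)
from scrapy.item import Item, Field

class StackItem(Item):
    title = Field()  # question title text
    url = Field()    # relative link to the question

With the project scaffolding in place, the spider would be run with Scrapy's standard CLI, e.g. scrapy crawl stack -o questions.json.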

爬取妹子图片.ipynb

Lines changed: 1362 additions & 0 deletions
Large diffs are not rendered by default.

爬妹子V1.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
__author__ = 'Xing'

# A simple first pass at scraping images

import requests
from bs4 import BeautifulSoup

url = "http://www.meizitu.com/"
path = '/Users/Xing/Documents/Crawler/sexy/'
cnt = 0  # photo counter, also used as the saved file name
visited = set()

head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
response = requests.get(url, headers=head)
soup = BeautifulSoup(response.text, 'lxml')
webList = soup.find_all('a', target='_blank')
for webtext in webList:
    # web_url = str(webtext).split(' ')[1].split('=')[1][1:-1]  # page URL
    web_url = webtext.get('href')
    try:
        if not web_url or web_url in visited: continue  # skip empty or already-visited links
        if 'meizitu' not in web_url: continue  # stay on the meizitu domain
        photo_web = requests.get(web_url, timeout=5)
        visited.add(web_url)
        s = BeautifulSoup(photo_web.text, 'lxml')
        photoList = s.find_all('img')
        for photo_text in photoList:
            # photo_url = str(photo_text).split(' ')[2].split('=')[1][1:-3]  # photo URL
            photo_url = photo_text['src']
            if 'pic.meizitu.com' not in photo_url: continue  # skip off-site images
            if 'limg' in photo_url: continue  # skip thumbnails
            print(cnt, photo_url, web_url)
            photo = requests.get(photo_url)
            with open(path + str(cnt), 'wb') as newfile:  # write the image to disk
                newfile.write(photo.content)
            cnt += 1  # bump the photo count
            print('get' + ' ' + str(cnt) + 'th' + ' photo')
            if cnt > 50:
                break
    except BaseException as e:
        print(e)
        print('web', webtext)
        print('photo', photo_text)
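
One quirk worth noting: the images are saved under the bare counter, with no file extension. A small sketch of how the suffix could be kept, using only the standard library (the '.jpg' fallback is an assumption, not part of the original script):

import os
ext = os.path.splitext(photo_url)[1] or '.jpg'  # take the extension from the URL, default to .jpg
with open(path + str(cnt) + ext, 'wb') as newfile:
    newfile.write(photo.content)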

爬妹子V2.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
__author__ = 'Xing'

# Goal for v2: 1) crawl all of the images, not just the front page

import requests
from bs4 import BeautifulSoup
from collections import deque

url = "http://www.meizitu.com/"
path = '/Users/Xing/Documents/Crawler/sexy/'

visited = set()  # URLs already crawled
stack = deque()  # frontier of URLs still to explore
stack.append(url)  # seed the frontier
nameSet = set()  # names of images already saved
TimeOut = 5
head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
totalCnt = 0
while stack and totalCnt <= 100:  # stop once ~100 images are saved
    try:
        url = stack.popleft()
        visited.add(url)
        response = requests.session().get(url, headers=head, timeout=TimeOut)
        soup = BeautifulSoup(response.content, 'lxml')
        webList = soup.find_all('a')
        for webText in webList:
            web_url = webText.get('href')
            if web_url and web_url not in visited:  # enqueue non-empty, unvisited URLs
                stack.append(web_url)
        # check whether the current page has any images
        photoList = soup.find_all('img')
        for photoText in photoList:
            photoUrl = photoText.get('src')
            if not photoUrl or 'uploads' not in photoUrl: continue  # no src, or not an uploaded photo
            if 'erweima' in photoUrl: continue  # skip QR-code images
            if 'limg' in photoUrl: continue  # skip thumbnails
            if 'templets' in photoUrl: continue  # skip template images
            photoName = photoText.get('alt')
            photoStoreName = photoUrl.split('uploads')[1]
            if photoName and photoStoreName not in nameSet:
                photo = requests.session().get(photoUrl, headers=head, timeout=TimeOut)
                with open(path + photoName, 'wb') as newfile:  # write the image to disk
                    newfile.write(photo.content)
                nameSet.add(photoStoreName)  # record the name so the image is not saved twice
                if totalCnt % 20 == 0: print('get' + ' ' + str(totalCnt) + 'th' + ' photo')
                totalCnt += 1
                if totalCnt > 100:
                    break  # exits the image loop; the while condition then ends the crawl
    except BaseException as e:
        print(e)
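
A subtlety in the crawl loop: visited only records URLs after they are dequeued, so a link that appears on many pages is enqueued once per appearance. A sketch of deduplicating at enqueue time instead (same link loop as above, marking on append):

for webText in webList:
    web_url = webText.get('href')
    if web_url and web_url not in visited:
        visited.add(web_url)  # mark when enqueued, so duplicates never enter the frontier
        stack.append(web_url)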

爬妹子v3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
__author__ = 'Xing'

爬草榴视频连接.ipynb

Lines changed: 1735 additions & 0 deletions
Large diffs are not rendered by default.
