Commit 638d28e

jinxin0924 committed: 2016清明学习爬虫 (learning web crawlers over the 2016 Qingming holiday)
1 parent: 278f5d7

9 files changed: +5658 −0 lines

Python 爬虫学习笔记.ipynb

Lines changed: 615 additions & 0 deletions
Large diffs are not rendered by default.

stack/stack/stack_spider.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
__author__ = 'Xing'


from scrapy import Spider
from scrapy.selector import Selector

from items import StackItem


class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        # each question on the listing page is a <div class="summary"> whose <h3> holds the link
        questions = Selector(response).xpath('//div[@class="summary"]/h3')

        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
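
The spider imports StackItem from an items module that does not appear in this commit's rendered diffs. The definition below is a guess inferred from the two fields the spider fills in; treat it as a sketch, not the file's actual contents:

# items.py (hypothetical -- inferred from item['title'] and item['url'] above)
from scrapy.item import Item, Field

class StackItem(Item):
    title = Field()  # question title text
    url = Field()    # relative link to the question

With the project scaffolding in place, the spider would be run with Scrapy's standard CLI, e.g. scrapy crawl stack -o questions.json.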

爬取妹子图片.ipynb

Lines changed: 1362 additions & 0 deletions
Large diffs are not rendered by default.

爬妹子V1.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
__author__ = 'Xing'

# A simple first pass at scraping images

import requests
from bs4 import BeautifulSoup

url = "http://www.meizitu.com/"
path = '/Users/Xing/Documents/Crawler/sexy/'
cnt = 0  # photo counter, also used as the saved file name
visited = set()

head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
response = requests.get(url, headers=head)
soup = BeautifulSoup(response.text, 'lxml')
webList = soup.find_all('a', target='_blank')
for webtext in webList:
    # web_url = str(webtext).split(' ')[1].split('=')[1][1:-1]  # page URL
    web_url = webtext.get('href')
    try:
        if not web_url or web_url in visited: continue  # skip empty or already-visited links
        if 'meizitu' not in web_url: continue  # stay on the meizitu domain
        photo_web = requests.get(web_url, timeout=5)
        visited.add(web_url)
        s = BeautifulSoup(photo_web.text, 'lxml')
        photoList = s.find_all('img')
        for photo_text in photoList:
            # photo_url = str(photo_text).split(' ')[2].split('=')[1][1:-3]  # photo URL
            photo_url = photo_text['src']
            if 'pic.meizitu.com' not in photo_url: continue  # skip off-site images
            if 'limg' in photo_url: continue  # skip thumbnails
            print(cnt, photo_url, web_url)
            photo = requests.get(photo_url)
            with open(path + str(cnt), 'wb') as newfile:  # write the image to disk
                newfile.write(photo.content)
            cnt += 1  # bump the photo count
            print('get' + ' ' + str(cnt) + 'th' + ' photo')
            if cnt > 50:
                break
    except BaseException as e:
        print(e)
        print('web', webtext)
        print('photo', photo_text)
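
One quirk worth noting: the images are saved under the bare counter, with no file extension. A small sketch of how the suffix could be kept, using only the standard library (the '.jpg' fallback is an assumption, not part of the original script):

import os
ext = os.path.splitext(photo_url)[1] or '.jpg'  # take the extension from the URL, default to .jpg
with open(path + str(cnt) + ext, 'wb') as newfile:
    newfile.write(photo.content)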

爬妹子V2.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
__author__ = 'Xing'

# Goal for v2: 1) crawl all of the images, not just the front page

import requests
from bs4 import BeautifulSoup
from collections import deque

url = "http://www.meizitu.com/"
path = '/Users/Xing/Documents/Crawler/sexy/'

visited = set()  # URLs already crawled
stack = deque()  # frontier of URLs still to explore
stack.append(url)  # seed the frontier
nameSet = set()  # names of images already saved
TimeOut = 5
head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
totalCnt = 0
while stack and totalCnt <= 100:  # stop once ~100 images are saved
    try:
        url = stack.popleft()
        visited.add(url)
        response = requests.session().get(url, headers=head, timeout=TimeOut)
        soup = BeautifulSoup(response.content, 'lxml')
        webList = soup.find_all('a')
        for webText in webList:
            web_url = webText.get('href')
            if web_url and web_url not in visited:  # enqueue non-empty, unvisited URLs
                stack.append(web_url)
        # check whether the current page has any images
        photoList = soup.find_all('img')
        for photoText in photoList:
            photoUrl = photoText.get('src')
            if not photoUrl or 'uploads' not in photoUrl: continue  # no src, or not an uploaded photo
            if 'erweima' in photoUrl: continue  # skip QR-code images
            if 'limg' in photoUrl: continue  # skip thumbnails
            if 'templets' in photoUrl: continue  # skip template images
            photoName = photoText.get('alt')
            photoStoreName = photoUrl.split('uploads')[1]
            if photoName and photoStoreName not in nameSet:
                photo = requests.session().get(photoUrl, headers=head, timeout=TimeOut)
                with open(path + photoName, 'wb') as newfile:  # write the image to disk
                    newfile.write(photo.content)
                nameSet.add(photoStoreName)  # record the name so the image is not saved twice
                if totalCnt % 20 == 0: print('get' + ' ' + str(totalCnt) + 'th' + ' photo')
                totalCnt += 1
                if totalCnt > 100:
                    break  # exits the image loop; the while condition then ends the crawl
    except BaseException as e:
        print(e)
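
A subtlety in the crawl loop: visited only records URLs after they are dequeued, so a link that appears on many pages is enqueued once per appearance. A sketch of deduplicating at enqueue time instead (same link loop as above, marking on append):

for webText in webList:
    web_url = webText.get('href')
    if web_url and web_url not in visited:
        visited.add(web_url)  # mark when enqueued, so duplicates never enter the frontier
        stack.append(web_url)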

爬妹子v3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
__author__ = 'Xing'

爬草榴视频连接.ipynb

Lines changed: 1735 additions & 0 deletions
Large diffs are not rendered by default.
