1
+ #!/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ #导入模块
5
+ import urllib2
6
+ import re
7
+ import os
8
+ import glob
9
+
10
+ #设定抓取页数
11
page_amount = 2  # number of listing pages to crawl, counting back from the newest
12
+
13
+ #抓取首页的html代码
14
def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Errors raised by ``urllib2.urlopen`` propagate to the caller, as does
    ``UnicodeDecodeError`` when the body is not valid UTF-8.
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36')  # browser User-Agent string
    # Open the prepared Request rather than the bare URL; otherwise the
    # header added above is never sent.
    response = urllib2.urlopen(req)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
20
+
21
+ #抓取图片
22
def read_image(url):
    """Download *url* and return the raw response body, undecoded.

    The request is sent with a desktop-browser ``User-Agent`` header.
    Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36')  # browser User-Agent string
    # Open the prepared Request rather than the bare URL; otherwise the
    # header added above is never sent.
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
28
+
29
+ #得到当前的最新页面数,从这个页面开始倒着爬,因为用了这个脚本以后以前的图可能已经看过了
30
def get_current_page_number(html):
    """Return the current page number found in *html*, as a string.

    The number is read from the first
    ``<span class="current-comment-page">[N]</span>`` marker in the text.

    Raises:
        ValueError: if *html* contains no such marker.
    """
    # \d+ stops at the closing bracket, so a second marker on the same line
    # cannot be swallowed into the captured group.
    match = re.search(r'<span class="current-comment-page">\[(\d+)\]</span>', html)
    if match is None:
        raise ValueError('current-comment-page marker not found in html')
    return match.group(1)
33
+
34
+ #得到图片列表
35
def get_picturs_url_list(url):
    """Return the picture URLs found on the listing page at *url*.

    The page is downloaded with ``get_page`` and scanned for
    ``<p><img src="..." /></p>`` tags whose source is an ``http://`` URL
    containing ``.sinaimg.cn/mw600/`` and ending in ``jpg``.

    Returns a list of URLs in page order; the list is empty when nothing
    matches.
    """
    html = get_page(url)
    # [^"]* keeps each match inside a single src attribute, so several
    # images on one line yield separate URLs instead of one merged match.
    # The dots in the host name are escaped so they match literally.
    pattern = r'<p><img src="(http://[^"]*\.sinaimg\.cn/mw600/[^"]*jpg)" /></p>'
    # str() converts each match from the decoded page text to a plain
    # string, as the original code did.
    return [str(src) for src in re.findall(pattern, html)]
43
+
44
+ #下载图片并存储到本地文件夹
45
def image_save(url, number):
    """Download the image at *url* and write it to ``<number>.jpg``.

    The file name is relative, so the file lands in the current working
    directory. Errors from ``read_image`` or from writing the file
    propagate to the caller.
    """
    # Download before opening the file so that a failed request does not
    # leave an empty .jpg behind.
    img = read_image(url)
    filename = str(number) + '.jpg'
    with open(filename, 'wb') as fp:
        fp.write(img)
50
+
51
+ #准备存放图片的文件夹,并进入到指定路径
52
def floder_prepare(floder):
    """Ensure the directory *floder* exists, then make it the working directory.

    Missing parent directories are created as well. ``OSError`` is raised
    if the directory cannot be created or entered, for example when
    *floder* already exists as a regular file.
    """
    # Check the path directly instead of listing glob('*'): the glob skips
    # dot-directories and never contains nested paths, so an existing
    # directory could be "missing" and mkdir would then fail.
    if not os.path.isdir(floder):
        os.makedirs(floder)
    os.chdir(floder)
57
+
58
+ #主函数
59
def main():
    """Crawl the newest listing pages and save every picture found.

    Reads the newest page number from the front page, collects picture
    URLs from ``page_amount`` pages counting backwards from it, then
    switches into the ``picture`` folder and saves the images as
    ``1.jpg``, ``2.jpg``, ... in the order they were collected.
    """
    front_page = get_page('http://jandan.net/ooxx')
    newest = int(get_current_page_number(front_page))

    # Gather all URLs before downloading anything.
    picture_urls = []
    for offset in range(page_amount):
        page_url = 'http://jandan.net/ooxx/page-' + str(newest - offset) + '#comments'
        picture_urls.extend(get_picturs_url_list(page_url))

    floder_prepare('picture')
    # File names are the 1-based position of each URL in the collected list.
    for index, picture_url in enumerate(picture_urls, 1):
        image_save(picture_url, index)
71
+
72
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
0 commit comments