Skip to content

Commit 24bc113

Browse files
committed
你懂的
1 parent 24dda92 commit 24bc113

File tree

1 file changed

+73
-0
lines changed

1 file changed

+73
-0
lines changed

Jimmy66/0013/girls_pictures.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
#导入模块
5+
import urllib2
6+
import re
7+
import os
8+
import glob
9+
10+
#设定抓取页数
11+
page_amount = 2  # number of listing pages to crawl, counting back from the newest page
12+
13+
#抓取首页的html代码
14+
def get_page(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded from UTF-8.

    Raises:
        urllib2.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib2.Request(url)
    # Browser User-Agent string sent with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36')
    # Open the prepared Request, not the bare URL; otherwise the header
    # configured above is never transmitted.
    response = urllib2.urlopen(req)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
20+
21+
#抓取图片
22+
def read_image(url):
    """Download *url* and return the raw, undecoded response body.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address of the image to fetch.

    Returns:
        The response body exactly as received (no decoding is applied).

    Raises:
        urllib2.URLError: If the request cannot be completed.
    """
    req = urllib2.Request(url)
    # Browser User-Agent string sent with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36')
    # Open the prepared Request, not the bare URL; otherwise the header
    # configured above is never transmitted.
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
28+
29+
#得到当前的最新页面数,从这个页面开始倒着爬,因为用了这个脚本以后以前的图可能已经看过了
30+
def get_current_page_number(html):
    """Extract the current comment-page number from listing-page HTML.

    Looks for the first ``<span class="current-comment-page">[N]</span>``
    marker and returns the text between the square brackets.

    Args:
        html: HTML text of a listing page.

    Returns:
        The bracketed text of the first marker, as a string (the caller is
        expected to convert it to ``int``).

    Raises:
        ValueError: If *html* contains no such marker.
    """
    # Non-greedy capture: the marker can occur more than once on a line, and
    # a greedy ``.*`` would swallow everything up to the last occurrence.
    match = re.search(r'<span class="current-comment-page">\[(.*?)\]</span>', html)
    if match is None:
        raise ValueError('current-comment-page marker not found in page HTML')
    return match.group(1)
33+
34+
#得到图片列表
35+
def get_picturs_url_list(url):
    """Return the sinaimg picture URLs embedded in the page at *url*.

    Only images written as ``<p><img src="http://<host>.sinaimg.cn/mw600/...jpg" /></p>``
    are picked up.

    Args:
        url: Address of the listing page to scan.

    Returns:
        A list of picture URLs in the order they appear in the page; empty
        when the page contains no matching image tags.
    """
    html = get_page(url)
    # One pass with a capture group. ``[^"]`` keeps each match inside a single
    # src attribute, so several images on one line are not merged into one
    # bogus URL, and the dots of the host name are matched literally.
    pattern = r'<p><img src="(http://[^"/]*\.sinaimg\.cn/mw600/[^"]*jpg)" /></p>'
    # str() turns the unicode matches into plain strings (avoids the u''
    # prefix under Python 2).
    return [str(src) for src in re.findall(pattern, html)]
43+
44+
#下载图片并存储到本地文件夹
45+
def image_save(url, number):
    """Download the image at *url* and store it as ``<number>.jpg``.

    The file is written in binary mode into the current working directory.

    Args:
        url: Address of the image to download.
        number: Value used (via ``str``) as the file name stem.
    """
    # Download before opening the target file, so a failed download does not
    # leave an empty ``.jpg`` behind.
    img = read_image(url)
    filename = str(number) + '.jpg'
    with open(filename, 'wb') as fp:
        fp.write(img)
50+
51+
#准备存放图片的文件夹,并进入到指定路径
52+
def floder_prepare(floder):
    """Make sure directory *floder* exists, then change into it.

    Args:
        floder: Path of the directory to create (if missing) and enter.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # Ask the filesystem directly: globbing '*' skips dot-prefixed names and
    # never lists nested paths, which made mkdir fail on existing folders.
    if not os.path.isdir(floder):
        os.makedirs(floder)
    os.chdir(floder)
57+
58+
#主函数
59+
def main():
    """Scrape the newest listing pages and save every picture found.

    Reads the newest page number from the front page, collects picture URLs
    from up to ``page_amount`` pages walking backwards from that page, then
    enters the ``picture`` folder and saves the images as ``1.jpg``,
    ``2.jpg``, ... in the order they were collected.
    """
    html = get_page('http://jandan.net/ooxx')
    newest = int(get_current_page_number(html))
    picture_urls = []
    # Clamp to the number of pages that exist, so page numbers below 1 are
    # never requested when page_amount exceeds the newest page number.
    for offset in range(min(page_amount, newest)):
        page_url = 'http://jandan.net/ooxx/page-' + str(newest - offset) + '#comments'
        picture_urls += get_picturs_url_list(page_url)
    floder_prepare('picture')
    # Files are numbered from 1 in collection order.
    for amount, picture_url in enumerate(picture_urls, 1):
        image_save(picture_url, amount)
71+
72+
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)