Commit 2fd69e0 ("Push Code")
1 parent 40bbf02

5 files changed: +512 / -110 lines

Chapter 16/16_2.py

Lines changed: 113 additions & 110 deletions
(The previous 110-line version is removed in full. Compared with it, the rewrite below drops the unused `import time` and `import random`, renames `pin_ids_file` to `pic_key_file`, hard-codes the append mode in `write_str_data` instead of taking it as a shadowing `type="a+"` parameter, and touches up the comments and the final print message.)
"""
Example of handling Ajax-loaded data: crawl all the scenery images from a board on huaban.com
"""
import requests as r
import os
import re
import json

# Prefix and suffix used to assemble an image URL from its key
img_start_url = 'http://img.hb.aicdn.com/'
img_end = '_fw658'

# File that stores the image keys
pic_key_file = 'pin_ids.txt'

# Regex that extracts the pins JSON embedded in the board page
boards_pattern = re.compile(r'pins":(.*)};')

# Regex that swaps the max pin_id in the pagination URL
max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)')

# Directory the images are saved to
pic_download_dir = os.path.join(os.getcwd(), 'HuaBan/')

# Request headers that mimic the site's own Ajax calls
ajax_headers = {
    'Host': 'huaban.com',
    'Accept': 'application/json',
    'X-Request': 'JSON',
    'X-Requested-With': 'XMLHttpRequest'
}


# Append a line of content to a file
def write_str_data(content, file_path):
    try:
        with open(file_path, 'a+', encoding='utf-8') as f:
            f.write(content + "\n")
    except OSError as reason:
        print(str(reason))


# Read a file line by line and return its contents as a list
def load_data(file_path):
    if os.path.exists(file_path):
        data_list = []
        with open(file_path, "r+", encoding='utf-8') as f:
            for line in f:
                data_list.append(line.replace("\n", ""))
        return data_list


# Fetch the board index page, write its key list to the file,
# and return the last pin_id for the follow-up Ajax queries
def get_boards_index_data(url):
    print("Requesting: " + url)
    resp = r.get(url).text
    result = boards_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    for item in json_dict:
        write_str_data(item['file']['key'], pic_key_file)
    # Return the last pin_id
    pin_id = json_dict[-1]['pin_id']
    return pin_id


# Simulate the Ajax request that loads more data
def get_json_list(url):
    print("Requesting: " + url)
    resp = r.get(url, headers=ajax_headers)
    if resp is None:
        return None
    json_dict = json.loads(resp.text)
    pins = json_dict['board']['pins']
    if len(pins) == 0:
        return None
    for item in pins:
        write_str_data(item['file']['key'], pic_key_file)
    return pins[-1]['pin_id']


# Download a single image by its key
def download_pic(key):
    url = img_start_url + key + img_end
    resp = r.get(url).content
    try:
        print("Downloading image: " + url)
        pic_name = key + ".jpg"
        with open(pic_download_dir + pic_name, "wb+") as f:
            f.write(resp)
    except Exception as reason:
        print(str(reason))


if __name__ == '__main__':
    if not os.path.exists(pic_download_dir):
        os.makedirs(pic_download_dir)
    # Remove the key file left over from a previous run, if any
    if os.path.exists(pic_key_file):
        os.remove(pic_key_file)
    # A board URL; replace it with your own
    boards_url = 'http://huaban.com/boards/279523/'
    board_last_pin_id = get_boards_index_data(boards_url)
    board_json_url = boards_url + '?jl58nz3i&max=43131274&limit=20&wfl=1'
    while True:
        board_last_pin_id = get_json_list(max_pattern.sub(str(board_last_pin_id), board_json_url))
        if board_last_pin_id is None:
            break
    pic_url_list = load_data(pic_key_file)
    for key in pic_url_list:
        download_pic(key)
    print("All images downloaded~")
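The pagination in `__main__` above hinges on `max_pattern`: every follow-up Ajax URL carries the last `pin_id` seen so far in its `max` query parameter. A minimal sketch of that rewrite step, using a made-up `pin_id`:

import re

# Same regex as in 16_2.py: the digits between "max=" and "&limit"
max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)')

board_json_url = 'http://huaban.com/boards/279523/?jl58nz3i&max=43131274&limit=20&wfl=1'
last_pin_id = 98765432  # hypothetical pin_id from the previous response
print(max_pattern.sub(str(last_pin_id), board_json_url))
# -> http://huaban.com/boards/279523/?jl58nz3i&max=98765432&limit=20&wfl=1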

Chapter 16/16_3.py

Lines changed: 10 additions & 0 deletions
"""
Selenium usage example
"""
from selenium import webdriver

browser = webdriver.Chrome()  # Drive the locally installed Chrome browser
browser.get('http://www.baidu.com')  # Request the page; a browser window opens
html_text = browser.page_source  # Grab the rendered page source
# browser.quit()  # Close the browser
print(html_text)
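If the popped-up window is unwanted, Chrome can also be driven headlessly. A minimal sketch, assuming a Selenium 3-era install where the keyword argument is still `chrome_options`:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run Chrome without opening a window
browser = webdriver.Chrome(chrome_options=options)
browser.get('http://www.baidu.com')
print(browser.page_source)
browser.quit()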

Chapter 16/16_4.py

Lines changed: 86 additions & 0 deletions
"""
Selenium example: crawl the "boring pictures" (无聊图) section of jandan.net
"""
import os
from selenium import webdriver
import redis
import requests as r
from bs4 import BeautifulSoup

# Base URL for the requests
base_url = 'http://jandan.net/pic'
# Directory the images are saved to
pic_save_path = os.path.join(os.getcwd(), 'JianDan/')
# Image counter, used as the Redis key
pic_count = 0

# Headers used when downloading the images
pic_headers = {
    'Host': 'wx2.sinaimg.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36 '
}


# Open a browser and walk through the pages
def browser_get():
    browser = webdriver.Chrome()
    browser.get(base_url)
    html_text = browser.page_source
    page_count = get_page_count(html_text)
    # Build and visit each page URL in turn
    for page in range(page_count, 0, -1):
        page_url = base_url + '/page-' + str(page)
        print('Parsing: ' + page_url)
        browser.get(page_url)
        html = browser.page_source
        get_meizi_url(html)
    # No more pages; close the browser
    browser.quit()


# Get the total page count
def get_page_count(html):
    bs = BeautifulSoup(html, 'lxml')
    page_count = bs.find('span', attrs={'class': 'current-comment-page'})
    return int(page_count.get_text()[1:-1]) - 1


# Collect the image links on one page and stash them in Redis
# (dan_redis is created in __main__ before browser_get() runs)
def get_meizi_url(html):
    soup = BeautifulSoup(html, 'html.parser')
    ol = soup.find('ol', attrs={'class': 'commentlist'})
    href = ol.find_all('a', attrs={'class': 'view_img_link'})
    global pic_count
    for a in href:
        dan_redis.set(str(pic_count), a['href'])
        pic_count += 1


# Download one image, fixing up protocol-relative URLs first
def download_pic(url):
    correct_url = url
    if url.startswith('//'):
        correct_url = url[2:]
    if not url.startswith('http'):
        correct_url = 'http://' + correct_url
    print("Downloading:", correct_url)
    try:
        resp = r.get(correct_url, headers=pic_headers).content
        pic_name = correct_url.split("/")[-1]
        with open(pic_save_path + pic_name, "wb+") as f:
            f.write(resp)
    except Exception as reason:
        print(str(reason))


if __name__ == '__main__':
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=1)
    dan_redis = redis.StrictRedis(connection_pool=pool)
    if not os.path.exists(pic_save_path):
        os.makedirs(pic_save_path)
    browser_get()
    results = dan_redis.mget(dan_redis.keys())
    for result in results:
        download_pic(result.decode('utf-8'))
    print("All images downloaded!")
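A side note on the manual `result.decode('utf-8')` in `__main__`: redis-py can decode responses to `str` for you. A minimal sketch under the same connection settings as above (the stored value here is a made-up example):

import redis

# decode_responses=True makes redis-py hand back str instead of bytes
pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345',
                            db=1, decode_responses=True)
dan_redis = redis.StrictRedis(connection_pool=pool)

dan_redis.set('0', '//wx2.sinaimg.cn/large/demo.jpg')  # hypothetical value
print(dan_redis.get('0'))  # already a str: '//wx2.sinaimg.cn/large/demo.jpg'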
