|
1 |
| -""" |
2 |
| -Ajax动态加载数据应对策略例子:爬取花瓣网某个画板的所有风景图 |
3 |
| -""" |
4 |
| -import time |
5 |
| -import random |
6 |
| -import requests as r |
7 |
| -import os |
8 |
| -import re |
9 |
| -import json |
10 |
| - |
11 |
# Prefix and suffix placed around an image key to build its download URL.
img_start_url = 'http://img.hb.aicdn.com/'
img_end = '_fw658'

# Captures the text between `pins":` and the last `};` of the board page.
boards_pattern = re.compile(r'pins":(.*)};')

# Matches the digits of the `max=` query parameter so they can be replaced.
max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)')

# File that collects the image keys, one per line.
pin_ids_file = 'pin_ids.txt'

# Directory the downloaded images are written to.
pic_download_dir = os.path.join(os.getcwd(), 'HuaBan/')

# Headers sent with the JSON (Ajax-style) requests.
ajax_headers = {
    'Host': 'huaban.com',
    'Accept': 'application/json',
    'X-Request': 'JSON',
    'X-Requested-With': 'XMLHttpRequest'
}
33 |
| - |
34 |
def write_str_data(content, file_path, type="a+"):
    """Write ``content`` followed by a newline to a UTF-8 text file.

    Args:
        content: Text to write (a ``str``).
        file_path: Path of the file to open.
        type: Mode string passed to ``open``; the default ``"a+"`` appends.

    An ``OSError`` raised while opening or writing is printed rather than
    re-raised, so a failed write does not stop the caller.
    """
    try:
        with open(file_path, type, encoding='utf-8') as f:
            f.write(content + "\n")
    except OSError as reason:
        print(str(reason))
41 |
| - |
42 |
def load_data(file_path):
    """Read a UTF-8 text file and return its lines without newline characters.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line. An empty list is returned when the
        path does not exist, so callers can always iterate the result.
    """
    if not os.path.exists(file_path):
        return []
    # Read-only access is enough; the file is never modified here.
    with open(file_path, "r", encoding='utf-8') as f:
        return [line.replace("\n", "") for line in f]
50 |
| - |
51 |
def get_boards_index_data(url):
    """Fetch the board page, record its image keys and return the last pin id.

    The page text is searched with ``boards_pattern``; the captured text is
    parsed as JSON and each entry's ``file.key`` is appended to
    ``pin_ids_file``.

    Args:
        url: Address of the board page.

    Returns:
        The ``pin_id`` of the last entry, used to request the next batch.

    Raises:
        ValueError: If the pattern is not found in the page or it yields no
            entries (previously an AttributeError / IndexError).
    """
    print("请求:" + url)
    # A timeout keeps an unresponsive server from hanging the script forever.
    resp = r.get(url, timeout=30).text
    result = boards_pattern.search(resp)
    if result is None:
        raise ValueError("no pins data found in page: " + url)
    pins = json.loads(result.group(1))
    if not pins:
        raise ValueError("board page lists no pins: " + url)
    for item in pins:
        write_str_data(item['file']['key'], pin_ids_file)
    return pins[-1]['pin_id']
62 |
| - |
63 |
| - |
64 |
def get_json_list(url):
    """Request one more batch of pins as JSON and record their image keys.

    Args:
        url: Board URL including the ``max``/``limit`` query parameters.

    Returns:
        The ``pin_id`` of the last pin in the batch, or ``None`` when the
        server answers with an HTTP error status or the batch is empty.
    """
    print("请求:" + url)
    resp = r.get(url, headers=ajax_headers, timeout=30)
    # requests.get never returns None; a failed request shows up as an error
    # status instead, so that is what ends the paging.
    if not resp.ok:
        return None
    pins = json.loads(resp.text)['board']['pins']
    if not pins:
        return None
    for item in pins:
        write_str_data(item['file']['key'], pin_ids_file)
    return pins[-1]['pin_id']
79 |
| - |
80 |
| - |
81 |
def download_pic(key):
    """Download the image identified by ``key`` into ``pic_download_dir``.

    The URL is ``img_start_url + key + img_end`` and the file is saved as
    ``<key>.jpg``. Network and file-system errors are printed rather than
    raised, so one failed image does not stop the remaining downloads.

    Args:
        key: Image key (a ``str``).
    """
    url = img_start_url + key + img_end
    print("下载图片:" + url)
    try:
        # The request is inside the try so connection failures are handled too.
        resp = r.get(url, timeout=30)
        # Do not save an HTTP error page under a .jpg name.
        resp.raise_for_status()
        with open(os.path.join(pic_download_dir, key + ".jpg"), "wb") as f:
            f.write(resp.content)
    except (OSError, r.RequestException) as reason:
        print(str(reason))
92 |
| - |
93 |
| - |
94 |
if __name__ == '__main__':
    os.makedirs(pic_download_dir, exist_ok=True)
    # Start from an empty key file so keys from an earlier run are not reused.
    if os.path.exists(pin_ids_file):
        os.remove(pin_ids_file)
    # A board URL; replace it with another board as needed.
    boards_url = 'http://huaban.com/boards/279523/'
    board_last_pin_id = get_boards_index_data(boards_url)
    board_json_url = boards_url + '?jl58nz3i&max=43131274&limit=20&wfl=1'
    # Page through the board: each request substitutes the last pin id seen
    # into the `max=` parameter, until no further id is returned.
    while board_last_pin_id is not None:
        board_last_pin_id = get_json_list(
            max_pattern.sub(str(board_last_pin_id), board_json_url))
    # `or []` keeps the loop safe if load_data returns None.
    for key in load_data(pin_ids_file) or []:
        download_pic(key)
    print("下载完成~")
| 1 | +""" |
| 2 | +Ajax动态加载数据应对策略例子:爬取花瓣网某个画板的所有风景图 |
| 3 | +""" |
| 4 | +import requests as r |
| 5 | +import os |
| 6 | +import re |
| 7 | +import json |
| 8 | + |
# Prefix and suffix placed around an image key to build its download URL.
img_start_url = 'http://img.hb.aicdn.com/'
img_end = '_fw658'

# File that collects the image keys, one per line.
pic_key_file = 'pin_ids.txt'

# Captures the text between `pins":` and the last `};` of the board page.
boards_pattern = re.compile(r'pins":(.*)};')

# Matches the digits of the `max=` query parameter so they can be replaced.
max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)')

# Directory the downloaded images are written to.
pic_download_dir = os.path.join(os.getcwd(), 'HuaBan/')

# Headers sent with the JSON (Ajax-style) requests.
ajax_headers = {
    'Host': 'huaban.com',
    'Accept': 'application/json',
    'X-Request': 'JSON',
    'X-Requested-With': 'XMLHttpRequest'
}
| 32 | + |
| 33 | + |
def write_str_data(content, file_path):
    """Append ``content`` followed by a newline to a UTF-8 text file.

    Args:
        content: Text to append (a ``str``).
        file_path: Path of the file; it is created if it does not exist.

    An ``OSError`` raised while opening or writing is printed rather than
    re-raised, so a failed write does not stop the caller.
    """
    try:
        # Append-only mode: this function never reads the file back.
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(content + "\n")
    except OSError as reason:
        print(str(reason))
| 41 | + |
| 42 | + |
def load_data(file_path):
    """Read a UTF-8 text file and return its lines without newline characters.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line. An empty list is returned when the
        path does not exist, so callers can always iterate the result.
    """
    if not os.path.exists(file_path):
        return []
    # Read-only access is enough; the file is never modified here.
    with open(file_path, "r", encoding='utf-8') as f:
        return [line.replace("\n", "") for line in f]
| 51 | + |
| 52 | + |
def get_boards_index_data(url):
    """Fetch the board page, record its image keys and return the last pin id.

    The page text is searched with ``boards_pattern``; the captured text is
    parsed as JSON and each entry's ``file.key`` is appended to
    ``pic_key_file``.

    Args:
        url: Address of the board page.

    Returns:
        The ``pin_id`` of the last entry, used to request the next batch.

    Raises:
        ValueError: If the pattern is not found in the page or it yields no
            entries (previously an AttributeError / IndexError).
    """
    print("请求:" + url)
    # A timeout keeps an unresponsive server from hanging the script forever.
    resp = r.get(url, timeout=30).text
    result = boards_pattern.search(resp)
    if result is None:
        raise ValueError("no pins data found in page: " + url)
    pins = json.loads(result.group(1))
    if not pins:
        raise ValueError("board page lists no pins: " + url)
    for item in pins:
        write_str_data(item['file']['key'], pic_key_file)
    return pins[-1]['pin_id']
| 64 | + |
| 65 | + |
def get_json_list(url):
    """Request one more batch of pins as JSON and record their image keys.

    Args:
        url: Board URL including the ``max``/``limit`` query parameters.

    Returns:
        The ``pin_id`` of the last pin in the batch, or ``None`` when the
        server answers with an HTTP error status or the batch is empty.
    """
    print("请求:" + url)
    resp = r.get(url, headers=ajax_headers, timeout=30)
    # requests.get never returns None; a failed request shows up as an error
    # status instead, so that is what ends the paging.
    if not resp.ok:
        return None
    pins = json.loads(resp.text)['board']['pins']
    if not pins:
        return None
    for item in pins:
        write_str_data(item['file']['key'], pic_key_file)
    return pins[-1]['pin_id']
| 81 | + |
| 82 | + |
def download_pic(key):
    """Download the image identified by ``key`` into ``pic_download_dir``.

    The URL is ``img_start_url + key + img_end`` and the file is saved as
    ``<key>.jpg``. Network and file-system errors are printed rather than
    raised, so one failed image does not stop the remaining downloads.

    Args:
        key: Image key (a ``str``).
    """
    url = img_start_url + key + img_end
    print("下载图片:" + url)
    try:
        # The request is inside the try so connection failures are handled too.
        resp = r.get(url, timeout=30)
        # Do not save an HTTP error page under a .jpg name.
        resp.raise_for_status()
        with open(os.path.join(pic_download_dir, key + ".jpg"), "wb") as f:
            f.write(resp.content)
    except (OSError, r.RequestException) as reason:
        print(str(reason))
| 94 | + |
| 95 | + |
if __name__ == '__main__':
    os.makedirs(pic_download_dir, exist_ok=True)
    # Start from an empty key file so keys from an earlier run are not reused.
    if os.path.exists(pic_key_file):
        os.remove(pic_key_file)
    # A board URL; replace it with another board as needed.
    boards_url = 'http://huaban.com/boards/279523/'
    board_last_pin_id = get_boards_index_data(boards_url)
    board_json_url = boards_url + '?jl58nz3i&max=43131274&limit=20&wfl=1'
    # Page through the board: each request substitutes the last pin id seen
    # into the `max=` parameter, until no further id is returned.
    while board_last_pin_id is not None:
        board_last_pin_id = get_json_list(
            max_pattern.sub(str(board_last_pin_id), board_json_url))
    # `or []` keeps the loop safe if load_data returns None.
    for key in load_data(pic_key_file) or []:
        download_pic(key)
    print("所有图片下载完成~")
0 commit comments