Commit aa00745
Scrape JD.com review photos (京东晒单图片)
1 parent 6638474 commit aa00745

File tree

1 file changed

dingdong/jd.py

Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
# Author: Jack Cui
# Website: http://cuijiahua.com
# Date: 2018-7-7
import os
import re
import sys
import bs4
import json
import math
import time
import argparse
import requests
from contextlib import closing

def search_goods(keyword, pages):
    """
    Search for products.
    Parameters:
        keyword - str, search keyword
        pages - int, number of result pages to fetch
    Returns:
        goods_urls - list, product page links
    """
    # Create a session so cookies persist across requests
    sess = requests.Session()
    goods_urls = []
    for page in range(pages):
        # First load: JD renders the top half of each result page directly
        search_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                          'Accept-Encoding': 'gzip, deflate, br',
                          'Accept-Language': 'zh-CN,zh;q=0.9',
                          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
                          'Host': 'search.jd.com'}
        s = page * 28
        if s == 0:
            s = 1
        # Search URL
        search_url = 'https://search.jd.com/Search'
        search_params = {'keyword': keyword,
                         'enc': 'utf-8',
                         'qrst': '1',
                         'rt': '1',
                         'stop': '1',
                         'vt': '2',
                         'wq': keyword,
                         'stock': '1',
                         'page': page * 2 + 1,
                         's': s,
                         'click': '0'}
        search_req = sess.get(url=search_url, params=search_params, headers=search_headers, verify=False)
        search_req.encoding = 'utf-8'
        # Extract product links
        search_req_bf = bs4.BeautifulSoup(search_req.text, 'lxml')
        for item in search_req_bf.find_all('li', class_='gl-item'):
            item_url = item.div.div.a.get('href')
            # Filter out ads
            if 'ccc-x.jd.com' not in item_url:
                goods_urls.append(item_url)
        # log_id is needed by the "load more" request below
        # (assumes the page embeds it; raises IndexError otherwise)
        log_id = re.findall("log_id:'(.*)',", search_req.text)[0]

        # Second load: the bottom half of each result page is lazy-loaded
        # through s_new.php as the browser scrolls
        search_more_url = 'https://search.jd.com/s_new.php'
        search_more_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                               'Accept-Encoding': 'gzip, deflate, br',
                               'Accept-Language': 'zh-CN,zh;q=0.9',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
                               'Host': 'search.jd.com',
                               'Referer': search_req.url}
        s = (1 + page) * 25
        search_more_params = {'keyword': keyword,
                              'enc': 'utf-8',
                              'qrst': '1',
                              'rt': '1',
                              'stop': '1',
                              'vt': '2',
                              'wq': keyword,
                              'stock': '1',
                              'page': (1 + page) * 2,
                              's': s,
                              'log_id': log_id,
                              'scrolling': 'y',
                              'tpl': '1_M'}
        search_more_req = sess.get(url=search_more_url, params=search_more_params, headers=search_more_headers, verify=False)
        search_more_req.encoding = 'utf-8'
        # Extract product links
        search_more_req_bf = bs4.BeautifulSoup(search_more_req.text, 'lxml')
        for item in search_more_req_bf.find_all('li', class_='gl-item'):
            item_url = item.div.div.a.get('href')
            # Filter out ads
            if 'ccc-x.jd.com' not in item_url:
                goods_urls.append(item_url)
    # Deduplicate
    goods_urls = list(set(goods_urls))
    # The hrefs are protocol-relative ("//item.jd.com/..."); prepend a scheme
    goods_urls = list(map(lambda x: 'http:' + x, goods_urls))
    return goods_urls

def goods_images(goods_url):
    """
    Get the review images of a product.
    Parameters:
        goods_url - str, product page link
    Returns:
        image_urls - list, image links
    """
    # Create a session
    sess = requests.Session()
    image_urls = []
    # The product id is the last path component of the item URL
    productId = goods_url.split('/')[-1].split('.')[0]
    # Comment API URL
    comment_url = 'https://sclub.jd.com/comment/productPageComments.action'
    comment_params = {'productId': productId,
                      'score': '0',
                      'sortType': '5',
                      'page': '0',
                      'pageSize': '10',
                      'isShadowSku': '0',
                      'fold': '1'}
    comment_headers = {'Accept': '*/*',
                       'Accept-Encoding': 'gzip, deflate, br',
                       'Accept-Language': 'zh-CN,zh;q=0.9',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
                       'Referer': goods_url,
                       'Host': 'sclub.jd.com'}

    comment_req = sess.get(url=comment_url, params=comment_params, headers=comment_headers, verify=False)
    html = json.loads(comment_req.text)
    # Total number of review images
    imageListCount = html['imageListCount']
    # Number of image pages, rounded up (10 images per page)
    pages = math.ceil(imageListCount / 10)
    for page in range(1, pages + 1):
        # Fetch the review image URLs for this page
        club_url = 'https://club.jd.com/discussion/getProductPageImageCommentList.action'
        # Millisecond timestamp for the cache-busting `_` parameter
        now = str(int(time.time() * 1000))
        club_params = {'productId': productId,
                       'isShadowSku': '0',
                       'page': page,
                       'pageSize': '10',
                       '_': now}
        club_headers = comment_headers
        club_req = sess.get(url=club_url, params=club_params, headers=club_headers, verify=False)
        html = json.loads(club_req.text)
        for img in html['imgComments']['imgList']:
            image_urls.append(img['imageUrl'])
    # Deduplicate
    image_urls = list(set(image_urls))
    # The URLs are protocol-relative; prepend a scheme
    image_urls = list(map(lambda x: 'http:' + x, image_urls))

    return image_urls

def download_image(path, image_url):
    """
    Download an image.
    Parameters:
        path - str, directory to save the image in
        image_url - str, image download link
    Returns:
        None
    """
    print(image_url)
    filename = image_url.split('/')[-1]
    image_path = os.path.join(path, filename)
    download_headers = {'Accept': '*/*',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'}
    size = 0
    # Stream the download so the file is written chunk by chunk
    with closing(requests.get(image_url, headers=download_headers, stream=True)) as response:
        chunk_size = 1024
        content_size = int(response.headers['content-length'])
        if response.status_code == 200:
            sys.stdout.write(filename + ' downloading:\n')
            sys.stdout.write('  [File size]: %0.2f MB\n' % (content_size / chunk_size / 1024))

            with open(image_path, 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)
                    file.flush()
                    sys.stdout.write('  [Progress]: %.2f%%' % float(size / content_size * 100) + '\r')
                    sys.stdout.flush()

def run(path, keyword, num):
    flag = False
    pages = 1
    # Widen the search until at least `num` product links are collected
    while not flag:
        goods_urls = search_goods(keyword, pages)
        print(goods_urls)
        if len(goods_urls) > num:
            flag = True
        else:
            pages += 1

    # Save into a per-keyword folder under the chosen path
    path = os.path.join(path, keyword)
    os.makedirs(path, exist_ok=True)
    for goods_url in goods_urls[:num]:
        image_urls = goods_images(goods_url)
        for image_url in image_urls:
            download_image(path, image_url)

if __name__ == '__main__':
    # With no arguments, print the help text instead of erroring on -k
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', help='store path', type=str, default=os.path.dirname(__file__))
    parser.add_argument('-k', '--keyword', required=True, help='search content')
    parser.add_argument('-n', '--num', help='the number of goods to download images for', type=int, default=1)
    args = parser.parse_args()
    run(args.dir, args.keyword, args.num)
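
For reference, a minimal usage sketch of the script this commit adds. This is an illustrative example, not part of the commit; it assumes the file is importable as jd.py, that requests, bs4, and lxml are installed, and that the 2018-era JD endpoints (search.jd.com, sclub.jd.com, club.jd.com) still respond the way the code expects:

# Hypothetical driver, roughly equivalent to: python jd.py -k 手机 -n 1 -d .
import os
from jd import search_goods, goods_images, download_image

keyword = '手机'                         # any search term
save_dir = os.path.join('.', keyword)
os.makedirs(save_dir, exist_ok=True)
# Product links from the first result page, then the review images
# of the first product found.
goods_urls = search_goods(keyword, pages=1)
if goods_urls:
    for image_url in goods_images(goods_urls[0]):
        download_image(save_dir, image_url)

Run as a CLI instead, python jd.py -k 手机 -n 2 -d ./images downloads the review images of the first two matching products into ./images/手机.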
