Skip to content

Commit c127d85

Browse files
committed
add ask spider
1 parent 52cfe6f commit c127d85

File tree

4 files changed

+488
-0
lines changed

4 files changed

+488
-0
lines changed

ask/jinti.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import json
2+
import random
3+
import redis
4+
import requests
5+
import threading
6+
import threadpool
7+
from lxml import etree
8+
import pymongo
9+
from bs4 import BeautifulSoup
10+
11+
# MongoDB handles used by this spider: a client for the server on localhost,
# its "ask" database, and the "jinti" collection inside that database.
clients = pymongo.MongoClient('localhost')
db = clients.ask
col = db.jinti
14+
15+
# Pool of browser User-Agent strings to pick from with random.choice().
# Every entry is unique so each one is equally likely to be selected; the
# order is that of first appearance in the original (partly duplicated) list.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]
52+
# Default HTTP request headers passed to requests.get() by start() and get().
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    # A hard-coded browser-session 'Cookie' header was disabled at this
    # position; no Cookie header is sent.
    # NOTE: random.choice() runs once, when this dict literal is evaluated at
    # import time, so every request that uses `headers` unchanged carries the
    # same User-Agent for the whole run.
    'User-Agent': random.choice(UA_LIST)
}
62+
63+
import os  # imported here so this block is self-contained; move to the top-level imports when convenient

# Proxy used for both http:// and https:// requests.
# NOTE(security): the fallback URL embeds proxy credentials that are committed
# to source control. Set JINTI_PROXY_URL in the environment to override it, and
# rotate the committed credentials / drop the fallback once deployments do so.
_PROXY_URL = os.environ.get('JINTI_PROXY_URL') or (
    "http://HP59J4A78453575D:0306D55796F8EB59@http-dyn.abuyun.com:9020"
)
proxies = {
    'http': _PROXY_URL,
    'https': _PROXY_URL,
}
67+
68+
69+
# def info_to_redis(data):
70+
# r = redis.StrictRedis(host='127.0.0.1',port=6379)
71+
# print("{} into redis".format(data))
72+
# r.rpush('rednet_url', data)
73+
#
74+
# def pop_from_redis():
75+
# r = redis.StrictRedis(host='127.0.0.1',port=6379, decode_responses=True)
76+
# return r.lpop('rednet_url')
77+
78+
def start(url, max_retries=3):
    """Crawl a jinti listing page and every page that follows it.

    For each listing page the question links are extracted and handed to
    ``main`` for scraping; then the pager's "下一页" (next page) link is
    followed until no such link is found.

    Pagination is a loop rather than recursion, and a failing page is retried
    at most ``max_retries`` times, so a dead proxy or a long listing can no
    longer exhaust the interpreter's recursion limit.

    Args:
        url: Absolute URL of the first listing page.
        max_retries: Maximum number of fetch attempts per listing page before
            the crawl stops.

    Returns:
        None.
    """
    pager_xpath = '//*[@id="PageArea1"]/a[last()-1]'
    while url:
        print(url)
        tree = None
        for attempt in range(1, max_retries + 1):
            # The module-level headers fix one User-Agent at import time;
            # choose a fresh one for every attempt.
            request_headers = dict(headers)
            request_headers['User-Agent'] = random.choice(UA_LIST)
            try:
                html = requests.get(url, headers=request_headers, proxies=proxies, timeout=30).text
                tree = etree.HTML(html)
            except (requests.RequestException, etree.LxmlError, ValueError) as exc:
                print("listing fetch failed ({}/{}) {}: {}".format(attempt, max_retries, url, exc))
                continue
            if tree is not None:
                break
        if tree is None:
            print("giving up on listing page {}".format(url))
            return

        hrefs = tree.xpath('//*[@class="PwentiBox"]/div[1]/dl/dd/ul/li[1]/a/@href')
        if hrefs:
            main(hrefs)

        # The second-to-last pager anchor is the "next page" link when present.
        next_label = ''.join(tree.xpath(pager_xpath + '/text()'))
        next_href = tree.xpath(pager_xpath + '/@href')
        if '下一页' in next_label and next_href:
            url = 'http://wenda.jinti.com' + next_href[0]
        else:
            url = None
96+
97+
98+
def get(url, max_retries=3):
    """Scrape one jinti question page and store it in the ``jinti`` collection.

    The page is fetched (retrying at most ``max_retries`` times), the title,
    question body and answer text are extracted, the record is inserted into
    MongoDB, and the URL is written to ``log\\log_jinti.txt``.

    Only the fetch is retried. A failure after the insert (for example a
    missing log directory) is reported instead of re-running the whole
    function, which previously re-inserted the same record without bound.

    Args:
        url: Absolute URL of the question page.
        max_retries: Maximum number of fetch attempts before the page is skipped.

    Returns:
        None.
    """
    tree = None
    for attempt in range(1, max_retries + 1):
        # Choose a User-Agent per attempt; the shared headers dict holds a
        # single value picked at import time.
        request_headers = dict(headers)
        request_headers['User-Agent'] = random.choice(UA_LIST)
        try:
            response = requests.get(url, headers=request_headers, proxies=proxies, timeout=30)
            tree = etree.HTML(response.content.decode('utf-8'))
        except (requests.RequestException, etree.LxmlError, ValueError) as exc:
            print("page fetch failed ({}/{}) {}: {}".format(attempt, max_retries, url, exc))
            continue
        if tree is not None:
            break
    if tree is None:
        print("giving up on page {}".format(url))
        return

    title = ''.join(tree.xpath('//*[@id="hasResult"]/div[1]/div[2]/div/h2/text()'))
    markitup = ''.join(tree.xpath('//*[@id="hasResult"]/div[1]/div[2]/div/p/text()'))
    comment = ''.join(tree.xpath('//*[@id="icomeansw"]/div[2]/div/div/div[1]/p/text()'))
    dic = {
        'title': title,
        'markitup': markitup,
        'comment': comment
    }
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in 4.
        col.insert_one(dic)
    except pymongo.errors.PyMongoError as exc:
        print("mongo insert failed for {}: {}".format(url, exc))
        return

    # Mode 'w' keeps only the most recently stored URL in the log file.
    try:
        with open('log\\log_jinti.txt', 'w') as f:
            f.write(url)
    except OSError as exc:
        print("could not write log for {}: {}".format(url, exc))
    print(title)
115+
116+
117+
def main(item):
    """Scrape a batch of question URLs concurrently and wait for completion.

    Args:
        item: Sequence of question page URLs; each one is passed to ``get``
            on a pool of 20 worker threads.

    Returns:
        None.
    """
    worker_count = 20
    pool = threadpool.ThreadPool(worker_count)
    try:
        for request in threadpool.makeRequests(get, item):
            pool.putRequest(request)
        pool.wait()
    finally:
        # start() calls main() once per listing page. Without dismissing them,
        # each call's workers would stay alive waiting on an empty queue and
        # threads would pile up over a long crawl.
        pool.dismissWorkers(worker_count)
122+
123+
124+
if __name__ == '__main__':
    # Run as a script: begin crawling at this listing page; start() follows
    # the "next page" links from there.
    start_url = 'http://wenda.jinti.com/l2n1s533z1.html'
    start(start_url)

ask/rednet.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import json
2+
import random
3+
import redis
4+
import requests
5+
import threading
6+
import threadpool
7+
from lxml import etree
8+
import pymongo
9+
from bs4 import BeautifulSoup
10+
11+
# Pool of browser User-Agent strings to pick from with random.choice().
# Every entry is unique so each one is equally likely to be selected; the
# order is that of first appearance in the original (partly duplicated) list.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]
48+
# Default HTTP request headers passed to requests.get() by start() and get().
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    # NOTE: random.choice() runs once, when this dict literal is evaluated at
    # import time, so every request that uses `headers` unchanged carries the
    # same User-Agent for the whole run.
    'User-Agent': random.choice(UA_LIST)
}
57+
58+
import os  # imported here so this block is self-contained; move to the top-level imports when convenient

# Proxy used for both http:// and https:// requests.
# NOTE(security): the fallback URL embeds proxy credentials that are committed
# to source control. Set REDNET_PROXY_URL in the environment to override it,
# and rotate the committed credentials / drop the fallback once deployments do so.
_PROXY_URL = os.environ.get('REDNET_PROXY_URL') or (
    "http://H817M021XIMJHWXD:5279D13BA73A6CFC@http-dyn.abuyun.com:9020"
)
proxies = {
    'http': _PROXY_URL,
    'https': _PROXY_URL,
}
62+
63+
64+
def info_to_redis(data):
    """Append ``data`` to the tail of the ``rednet_url`` Redis list.

    A new client for the Redis server at 127.0.0.1:6379 is created on every
    call, and a progress line is printed before the push.

    Args:
        data: Value to push onto the list.
    """
    client = redis.StrictRedis(host='127.0.0.1', port=6379)
    message = "{} into redis".format(data)
    print(message)
    client.rpush('rednet_url', data)
68+
69+
70+
def pop_from_redis():
    """Remove and return the head of the ``rednet_url`` Redis list.

    The client is created with ``decode_responses=True``.

    Returns:
        Whatever ``lpop`` returns for the ``rednet_url`` key.
    """
    client = redis.StrictRedis(host='127.0.0.1', port=6379, decode_responses=True)
    head = client.lpop('rednet_url')
    return head
73+
74+
75+
def start(url):
    """Fetch one rednet listing page and append its question links to ``rednet_urls.txt``.

    Args:
        url: Absolute URL of a listing page.

    Returns:
        None.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the 30-second timeout expires.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    print(url)
    # The module-level headers fix one User-Agent at import time; choose a
    # fresh one for this request.
    request_headers = dict(headers)
    request_headers['User-Agent'] = random.choice(UA_LIST)
    # A timeout keeps a stalled proxy from blocking this worker thread forever.
    response = requests.get(url, headers=request_headers, proxies=proxies, timeout=30)
    tree = etree.HTML(response.content.decode('utf-8'))
    if tree is None:
        return
    hrefs = tree.xpath('/html/body/div[2]/div[2]/div/div/div[1]/div[2]/div[1]/div/div/div/h4/a/@href')
    if not hrefs:
        return
    for href in hrefs:
        print(href)
    # Open the file once per page and emit all links in a single write; this
    # function runs on several threads, and one write per page keeps a page's
    # links together better than reopening the file for every link.
    with open('rednet_urls.txt', 'a', encoding='utf-8') as f:
        f.write(''.join(href + '\n' for href in hrefs))
84+
85+
86+
def get(url):
    """Fetch one rednet question page and print its title, body and answer text.

    Args:
        url: Absolute URL of the question page.

    Returns:
        None; the three extracted strings are printed.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the 30-second timeout expires.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The module-level headers fix one User-Agent at import time; choose a
    # fresh one for this request.
    request_headers = dict(headers)
    request_headers['User-Agent'] = random.choice(UA_LIST)
    response = requests.get(url, headers=request_headers, proxies=proxies, timeout=30)
    tree = etree.HTML(response.content.decode('utf-8'))
    if tree is None:
        print("no parsable HTML at {}".format(url))
        return
    title = ''.join(tree.xpath('/html/body/div[2]/div/div/div/div[1]/div[2]/div[1]/h1/text()'))
    markitup = ''.join(tree.xpath('/html/body/div[2]/div/div/div/div[1]/div[2]/div[2]/div/text()'))
    # The original expression matched only id="answer_list_146747", i.e. a
    # single question. Match on the id prefix instead.
    # NOTE(review): assumes other pages use the same "answer_list_<id>"
    # naming; confirm against live markup.
    comment = ''.join(tree.xpath('//*[starts-with(@id, "answer_list_")]/div[2]/div[1]/text()'))
    print(title, markitup, comment)
92+
93+
94+
def main(item):
    """Run ``start`` for every entry of ``item`` on 10 worker threads.

    Blocks until all queued requests have been processed.

    Args:
        item: Sequence of listing-page URLs, one work request per entry.
    """
    pool = threadpool.ThreadPool(10)
    for request in threadpool.makeRequests(start, item):
        pool.putRequest(request)
    pool.wait()
99+
100+
101+
if __name__ == '__main__':
    # Run as a script: build the URLs of listing pages 1..6887 and process
    # them on the thread pool.
    page_template = 'http://ask.rednet.cn/sort_type-new__day-0__is_recommend-0__page-{}'
    urls = [page_template.format(page) for page in range(1, 6888)]
    main(urls)

0 commit comments

Comments
 (0)