Commit 865a13e

enuiai committed: Remove invalid crawlers
1 parent 9212cec commit 865a13e

File tree: 1 file changed (+1, -95 lines)


proxypool/crawler.py

Lines changed: 1 addition & 95 deletions
@@ -23,15 +23,7 @@ def get_proxies(self, callback):
             print('Successfully got proxy', proxy)
             proxies.append(proxy)
         return proxies
-
-    # def crawl_daxiang(self):
-    #     url = 'http://vtp.daxiangdaili.com/ip/?tid=559363191592228&num=50&filter=on'
-    #     html = get_page(url)
-    #     if html:
-    #         urls = html.split('\n')
-    #         for url in urls:
-    #             yield url
-
+
     def crawl_daili66(self, page_count=4):
         """
         Get proxies from daili66
@@ -51,47 +43,6 @@ def crawl_daili66(self, page_count=4):
                     port = tr.find('td:nth-child(2)').text()
                     yield ':'.join([ip, port])
 
-    def crawl_proxy360(self):
-        """
-        Get proxies from Proxy360
-        :return: proxy
-        """
-        start_url = 'http://www.proxy360.cn/Region/China'
-        print('Crawling', start_url)
-        html = get_page(start_url)
-        if html:
-            doc = pq(html)
-            lines = doc('div[name="list_proxy_ip"]').items()
-            for line in lines:
-                ip = line.find('.tbBottomLine:nth-child(1)').text()
-                port = line.find('.tbBottomLine:nth-child(2)').text()
-                yield ':'.join([ip, port])
-
-    def crawl_goubanjia(self):
-        """
-        Get proxies from Goubanjia
-        :return: proxy
-        """
-        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
-        html = get_page(start_url)
-        if html:
-            doc = pq(html)
-            tds = doc('td.ip').items()
-            for td in tds:
-                td.find('p').remove()
-                yield td.text().replace(' ', '')
-
-    def crawl_ip181(self):
-        start_url = 'http://www.ip181.com/'
-        html = get_page(start_url)
-        ip_address = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
-        # \s* matches whitespace, so the pattern works across line breaks
-        re_ip_address = ip_address.findall(html)
-        for address,port in re_ip_address:
-            result = address + ':' + port
-            yield result.replace(' ', '')
-
-
     def crawl_ip3366(self):
         for page in range(1, 4):
             start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
@@ -102,42 +53,6 @@ def crawl_ip3366(self):
             for address, port in re_ip_address:
                 result = address+':'+ port
                 yield result.replace(' ', '')
-
-
-    def crawl_kxdaili(self):
-        for i in range(1, 11):
-            start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
-            html = get_page(start_url)
-            ip_address = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
-            # \s* matches whitespace, so the pattern works across line breaks
-            re_ip_address = ip_address.findall(html)
-            for address, port in re_ip_address:
-                result = address + ':' + port
-                yield result.replace(' ', '')
-
-
-    def crawl_premproxy(self):
-        for i in ['China-01','China-02','China-03','China-04','Taiwan-01']:
-            start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
-            html = get_page(start_url)
-            if html:
-                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
-                re_ip_address = ip_address.findall(html)
-                for address_port in re_ip_address:
-                    yield address_port.replace(' ','')
-
-    def crawl_xroxy(self):
-        for i in ['CN','TW']:
-            start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
-            html = get_page(start_url)
-            if html:
-                ip_address1 = re.compile("title='View this Proxy details'>\s*(.*).*")
-                re_ip_address1 = ip_address1.findall(html)
-                ip_address2 = re.compile("title='Select proxies with port number .*'>(.*)</a>")
-                re_ip_address2 = ip_address2.findall(html)
-                for address,port in zip(re_ip_address1,re_ip_address2):
-                    address_port = address+':'+port
-                    yield address_port.replace(' ','')
 
     def crawl_kuaidaili(self):
         for i in range(1, 4):
@@ -206,15 +121,6 @@ def crawl_iphai(self):
                 address_port = address+':'+port
                 yield address_port.replace(' ','')
 
-    def crawl_89ip(self):
-        start_url = 'http://www.89ip.cn/apijk/?&tqsl=1000&sxa=&sxb=&tta=&ports=&ktip=&cf=1'
-        html = get_page(start_url)
-        if html:
-            find_ips = re.compile('(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
-            ip_ports = find_ips.findall(html)
-            for address_port in ip_ports:
-                yield address_port
-
     def crawl_data5u(self):
         start_url = 'http://www.data5u.com/free/gngn/index.shtml'
         headers = {
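
Note: the commit message calls these crawlers "invalid" because their source sites no longer serve usable free proxy lists. As a minimal sketch (not part of this commit), one way to verify that before pruning a method is to probe each removed start URL for a live response; this assumes the requests library, and check_source / REMOVED_SOURCES are illustrative names only.

import requests

# Start URLs of the methods deleted in this commit.
REMOVED_SOURCES = [
    'http://www.proxy360.cn/Region/China',
    'http://www.goubanjia.com/free/gngn/index.shtml',
    'http://www.ip181.com/',
    'http://www.kxdaili.com/ipList/1.html#ip',
    'https://premproxy.com/proxy-by-country/China-01.htm',
    'http://www.xroxy.com/proxylist.php?country=CN',
    'http://www.89ip.cn/apijk/?&tqsl=1000&sxa=&sxb=&tta=&ports=&ktip=&cf=1',
]

def check_source(url, timeout=10):
    """Return True if the source still answers with HTTP 200."""
    try:
        return requests.get(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        # Connection errors and timeouts count as a dead source.
        return False

if __name__ == '__main__':
    for url in REMOVED_SOURCES:
        print('alive' if check_source(url) else 'dead', url)

A 200 response alone does not prove the page still contains proxies, so a stricter check could also run the corresponding crawl_* generator and confirm it yields at least one address.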
