@@ -23,15 +23,7 @@ def get_proxies(self, callback):
             print('成功获取到代理', proxy)
             proxies.append(proxy)
         return proxies
-
-    # def crawl_daxiang(self):
-    #     url = 'http://vtp.daxiangdaili.com/ip/?tid=559363191592228&num=50&filter=on'
-    #     html = get_page(url)
-    #     if html:
-    #         urls = html.split('\n')
-    #         for url in urls:
-    #             yield url
-
+
     def crawl_daili66(self, page_count=4):
         """
         获取代理66
@@ -51,47 +43,6 @@ def crawl_daili66(self, page_count=4):
                     port = tr.find('td:nth-child(2)').text()
                     yield ':'.join([ip, port])

-    def crawl_proxy360(self):
-        """
-        获取Proxy360
-        :return: 代理
-        """
-        start_url = 'http://www.proxy360.cn/Region/China'
-        print('Crawling', start_url)
-        html = get_page(start_url)
-        if html:
-            doc = pq(html)
-            lines = doc('div[name="list_proxy_ip"]').items()
-            for line in lines:
-                ip = line.find('.tbBottomLine:nth-child(1)').text()
-                port = line.find('.tbBottomLine:nth-child(2)').text()
-                yield ':'.join([ip, port])
-
-    def crawl_goubanjia(self):
-        """
-        获取Goubanjia
-        :return: 代理
-        """
-        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
-        html = get_page(start_url)
-        if html:
-            doc = pq(html)
-            tds = doc('td.ip').items()
-            for td in tds:
-                td.find('p').remove()
-                yield td.text().replace(' ', '')
-
-    def crawl_ip181(self):
-        start_url = 'http://www.ip181.com/'
-        html = get_page(start_url)
-        ip_address = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
-        # \s* 匹配空格,起到换行作用
-        re_ip_address = ip_address.findall(html)
-        for address, port in re_ip_address:
-            result = address + ':' + port
-            yield result.replace(' ', '')
-
-
     def crawl_ip3366(self):
         for page in range(1, 4):
             start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
@@ -102,42 +53,6 @@ def crawl_ip3366(self):
             for address, port in re_ip_address:
                 result = address + ':' + port
                 yield result.replace(' ', '')
-
-
-    def crawl_kxdaili(self):
-        for i in range(1, 11):
-            start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
-            html = get_page(start_url)
-            ip_address = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
-            # \s* 匹配空格,起到换行作用
-            re_ip_address = ip_address.findall(html)
-            for address, port in re_ip_address:
-                result = address + ':' + port
-                yield result.replace(' ', '')
-
-
-    def crawl_premproxy(self):
-        for i in ['China-01','China-02','China-03','China-04','Taiwan-01']:
-            start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
-            html = get_page(start_url)
-            if html:
-                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
-                re_ip_address = ip_address.findall(html)
-                for address_port in re_ip_address:
-                    yield address_port.replace(' ','')
-
-    def crawl_xroxy(self):
-        for i in ['CN','TW']:
-            start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
-            html = get_page(start_url)
-            if html:
-                ip_address1 = re.compile("title='View this Proxy details'>\s*(.*).*")
-                re_ip_address1 = ip_address1.findall(html)
-                ip_address2 = re.compile("title='Select proxies with port number .*'>(.*)</a>")
-                re_ip_address2 = ip_address2.findall(html)
-                for address, port in zip(re_ip_address1, re_ip_address2):
-                    address_port = address + ':' + port
-                    yield address_port.replace(' ','')

     def crawl_kuaidaili(self):
         for i in range(1, 4):
@@ -206,15 +121,6 @@ def crawl_iphai(self):
                     address_port = address + ':' + port
                     yield address_port.replace(' ','')

-    def crawl_89ip(self):
-        start_url = 'http://www.89ip.cn/apijk/?&tqsl=1000&sxa=&sxb=&tta=&ports=&ktip=&cf=1'
-        html = get_page(start_url)
-        if html:
-            find_ips = re.compile('(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
-            ip_ports = find_ips.findall(html)
-            for address_port in ip_ports:
-                yield address_port
-
     def crawl_data5u(self):
         start_url = 'http://www.data5u.com/free/gngn/index.shtml'
         headers = {
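The crawler methods kept by this diff all follow the same shape: fetch a listing page with the project's get_page helper, extract IP and port with pyquery or a regular expression, and yield proxies as `ip:port` strings for get_proxies() to collect. A minimal sketch of that pattern is below; the source URL, the table layout assumed by the regex, and the `from utils import get_page` import path are placeholders for illustration, not one of the project's real sources.

```python
import re

from utils import get_page  # assumption: the project's page-fetching helper used throughout this diff


class ExampleCrawler(object):
    def crawl_example(self):
        # Hypothetical free-proxy listing page; stands in for any real source site.
        start_url = 'http://example.com/free-proxy-list'
        html = get_page(start_url)
        if html:
            # Assumed row layout: IP and port sit in adjacent <td> cells of each table row.
            ip_port = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
            for address, port in ip_port.findall(html):
                # Yield proxies in the ip:port form that get_proxies() expects.
                yield (address + ':' + port).replace(' ', '')
```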