16 files changed: +397 −377 lines.

[Two files in this changeset were deleted; their diffs are not shown.]
Modified file: the Flask API module

  from flask import Flask, g
-
  from .db import RedisClient
+ from .setting import API_HOST, API_PORT

  __all__ = ['app']

  app = Flask(__name__)

-
  def get_conn():
+     """
+     get redis client object
+     :return:
+     """
      if not hasattr(g, 'redis'):
          g.redis = RedisClient()
      return g.redis

-
  @app.route('/')
  def index():
      return '<h2>Welcome to Proxy Pool System</h2>'

-
  @app.route('/random')
  def get_proxy():
      """
-     Get a proxy
+     get a random proxy
      :return: a random proxy
      """
      conn = get_conn()
      return conn.random()

-
  @app.route('/count')
  def get_counts():
      """
-     Get the count of proxies
+     get the count of proxies
      :return: total number of proxies in the pool
      """
      conn = get_conn()
      return str(conn.count())

-
  if __name__ == '__main__':
-     app.run()
+     app.run(host=API_HOST, port=API_PORT, threaded=True)
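The new run call depends on API_HOST and API_PORT from the package's setting module, which this view does not include. A minimal sketch of what that module might define, assuming conventional Flask serving defaults (both values are assumptions, not taken from this diff):

# setting.py -- hypothetical sketch; the real module is not shown in this diff
API_HOST = '0.0.0.0'  # assumed: listen on all interfaces
API_PORT = 5555       # assumed port; use whatever the deployment expects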
[Deleted file; diff not shown.]
New file: proxypool/crawlers/__init__.py

+ from .daili66 import Daili66Crawler
+ from .ip3366 import IP3366Crawler
+ from .iphai import IPHaiCrawler
+
+
+ __all__ = [
+     Daili66Crawler,
+     IP3366Crawler,
+     IPHaiCrawler
+ ]
New file: proxypool/crawlers/base.py

+ from retrying import retry
+ import requests
+ from loguru import logger
+
+
+ class BaseCrawler(object):
+     urls = []
+
+     @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None)
+     def fetch(self, url, **kwargs):
+         # retried up to 3 times while the result is None
+         try:
+             response = requests.get(url, **kwargs)
+             if response.status_code == 200:
+                 return response.text
+         except requests.ConnectionError:
+             return
+
+     @logger.catch
+     def crawl(self):
+         """
+         crawl main method: fetch each url in self.urls and delegate
+         extraction to the subclass's parse() generator
+         """
+         for url in self.urls:
+             logger.info(f'fetching {url}')
+             html = self.fetch(url)
+             for proxy in self.parse(html):
+                 yield proxy
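BaseCrawler leaves parse() to its subclasses: a concrete crawler only declares its urls and implements parse(html) as a generator of Proxy objects, while the inherited crawl() handles fetching, retries, and logging. A minimal sketch of that contract (the URL and the line format are made up; Proxy is assumed to accept host and port keyword arguments, as the crawlers below pass them):

from proxypool.crawlers.base import BaseCrawler
from proxypool.proxy import Proxy


class ExampleCrawler(BaseCrawler):
    """
    hypothetical crawler illustrating the BaseCrawler contract
    """
    urls = ['http://example.com/free-proxy-list']  # made-up URL

    def parse(self, html):
        # illustrative only: yield one Proxy per "host:port" line
        for line in html.splitlines():
            host, _, port = line.strip().partition(':')
            if host and port.isdigit():
                yield Proxy(host=host, port=int(port))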
New file: proxypool/crawlers/daili66.py

+ from pyquery import PyQuery as pq
+ from proxypool.proxy import Proxy
+ from proxypool.crawlers.base import BaseCrawler
+
+
+ BASE_URL = 'http://www.66ip.cn/{page}.html'
+ MAX_PAGE = 5
+
+
+ class Daili66Crawler(BaseCrawler):
+     """
+     daili66 crawler, http://www.66ip.cn/1.html
+     """
+     urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+
+     def parse(self, html):
+         """
+         parse html file to get proxies
+         :return:
+         """
+         doc = pq(html)
+         # tr:gt(0) skips the header row of the proxy table
+         trs = doc('.containerbox table tr:gt(0)').items()
+         for tr in trs:
+             host = tr.find('td:nth-child(1)').text()
+             port = int(tr.find('td:nth-child(2)').text())
+             yield Proxy(host=host, port=port)
+
+
+ if __name__ == '__main__':
+     crawler = Daili66Crawler()
+     for proxy in crawler.crawl():
+         print(proxy)
New file: proxypool/crawlers/ip3366.py

+ from proxypool.crawlers.base import BaseCrawler
+ from proxypool.proxy import Proxy
+ import re
+
+
+ MAX_PAGE = 5
+ BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
+
+
+ class IP3366Crawler(BaseCrawler):
+     """
+     ip3366 crawler, http://www.ip3366.net/
+     """
+     urls = [BASE_URL.format(page=i) for i in range(1, MAX_PAGE + 1)]
+
+     def parse(self, html):
+         """
+         parse html file to get proxies
+         :return:
+         """
+         # \s* also matches the newlines between table cells
+         ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
+         re_ip_address = ip_address.findall(html)
+         for address, port in re_ip_address:
+             proxy = Proxy(host=address.strip(), port=int(port.strip()))
+             yield proxy
+
+
+ if __name__ == '__main__':
+     crawler = IP3366Crawler()
+     for proxy in crawler.crawl():
+         print(proxy)
New file: proxypool/crawlers/iphai.py

+ from proxypool.crawlers.base import BaseCrawler
+ from proxypool.proxy import Proxy
+ import re
+
+
+ BASE_URL = 'http://www.iphai.com/'
+
+
+ class IPHaiCrawler(BaseCrawler):
+     """
+     iphai crawler, http://www.iphai.com/
+     """
+     urls = [BASE_URL]
+
+     def parse(self, html):
+         """
+         parse html file to get proxies
+         :return:
+         """
+         find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
+         trs = find_tr.findall(html)
+         # compile the cell patterns once, not on every loop iteration
+         find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
+         find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
+         # trs[0] is the table header row, so start at index 1
+         for s in range(1, len(trs)):
+             re_ip_address = find_ip.findall(trs[s])
+             re_port = find_port.findall(trs[s])
+             for address, port in zip(re_ip_address, re_port):
+                 proxy = Proxy(host=address.strip(), port=int(port.strip()))
+                 yield proxy
+
+
+ if __name__ == '__main__':
+     crawler = IPHaiCrawler()
+     for proxy in crawler.crawl():
+         print(proxy)