
Commit 39bcf6b

update
1 parent 74ea581 commit 39bcf6b

File tree

16 files changed: +397 -377 lines changed


examples/example.py

Lines changed: 0 additions & 29 deletions
This file was deleted.

examples/proxytest.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

proxypool/api.py

Lines changed: 8 additions & 9 deletions
@@ -1,42 +1,41 @@
 from flask import Flask, g
-
 from .db import RedisClient
+from .setting import API_HOST, API_PORT
 
 __all__ = ['app']
 
 app = Flask(__name__)
 
-
 def get_conn():
+    """
+    get redis client object
+    :return:
+    """
     if not hasattr(g, 'redis'):
         g.redis = RedisClient()
     return g.redis
 
-
 @app.route('/')
 def index():
     return '<h2>Welcome to Proxy Pool System</h2>'
 
-
 @app.route('/random')
 def get_proxy():
     """
-    Get a proxy
+    get a random proxy
     :return: a random proxy
     """
     conn = get_conn()
     return conn.random()
 
-
 @app.route('/count')
 def get_counts():
     """
-    Get the count of proxies
+    get the count of proxies
     :return: total number of proxies in the pool
     """
     conn = get_conn()
     return str(conn.count())
 
-
 if __name__ == '__main__':
-    app.run()
+    app.run(host=API_HOST, port=API_PORT, threaded=True)
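The added import relies on a proxypool/setting.py module that exposes API_HOST and API_PORT; that module is not shown in this commit. A minimal sketch of what it could contain, with placeholder values (only the two constant names are confirmed by the diff), might be:

# proxypool/setting.py -- hypothetical sketch; only the names API_HOST and
# API_PORT are confirmed by the import above, the values are assumptions.
API_HOST = '0.0.0.0'   # assumed bind address for the Flask app
API_PORT = 5555        # assumed port for the proxy pool API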

proxypool/crawler.py

Lines changed: 0 additions & 147 deletions
This file was deleted.

proxypool/crawlers/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from .daili66 import Daili66Crawler
from .ip3366 import IP3366Crawler
from .iphai import IPHaiCrawler


__all__ = [
    Daili66Crawler,
    IP3366Crawler,
    IPHaiCrawler
]
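Because __all__ here collects the crawler classes themselves rather than their names as strings, it works as a simple registry to iterate over, though a star import from proxypool.crawlers would not behave as usual (star imports expect string entries). The consumer of this registry is not part of this diff; a plausible usage sketch (the getter name is assumed) is:

# Hypothetical consumer of the crawler registry -- not shown in this commit.
from proxypool.crawlers import __all__ as crawler_classes

def get_all_crawlers():
    # instantiate every registered crawler class
    return [crawler_cls() for crawler_cls in crawler_classes]

for crawler in get_all_crawlers():
    for proxy in crawler.crawl():
        print(proxy)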

proxypool/crawlers/base.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
from retrying import retry
import requests
from loguru import logger


class BaseCrawler(object):
    urls = []

    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None)
    def fetch(self, url, **kwargs):
        try:
            response = requests.get(url, **kwargs)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return

    @logger.catch
    def crawl(self):
        """
        crawl main method
        """
        for url in self.urls:
            logger.info(f'fetching {url}')
            html = self.fetch(url)
            for proxy in self.parse(html):
                yield proxy
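BaseCrawler defines the contract the concrete crawlers below follow: a class-level urls list plus a parse(html) generator that yields Proxy objects, with fetch() retried up to three times whenever it returns None (a non-200 response or a connection error). An illustrative subclass, with an invented URL and line format, could look like this:

# Illustrative BaseCrawler subclass -- the URL and the host:port line format
# are made up purely to show the urls/parse contract.
from proxypool.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    urls = ['http://example.com/proxy-list.txt']

    def parse(self, html):
        # one "host:port" entry per line in this imaginary source
        for line in html.splitlines():
            host, _, port = line.partition(':')
            if host and port.isdigit():
                yield Proxy(host=host, port=int(port))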

proxypool/crawlers/daili66.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
from pyquery import PyQuery as pq
from proxypool.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


BASE_URL = 'http://www.636ip.cn/{page}.html'
MAX_PAGE = 5

class Daili66Crawler(BaseCrawler):
    """
    daili66 crawler, http://www.66ip.cn/1.html
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]


    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        doc = pq(html)
        trs = doc('.containerbox table tr:gt(0)').items()
        for tr in trs:
            host = tr.find('td:nth-child(1)').text()
            port = int(tr.find('td:nth-child(2)').text())
            yield Proxy(host=host, port=port)

if __name__ == '__main__':
    crawler = Daili66Crawler()
    for proxy in crawler.crawl():
        print(proxy)
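The crawlers construct Proxy(host=..., port=...) objects, but proxypool/proxy.py itself is not part of this diff. A hypothetical minimal stand-in, sufficient for the __main__ blocks above to print something readable, might be:

# Hypothetical stand-in for proxypool.proxy.Proxy -- only the host/port
# constructor keywords are confirmed by the crawlers; everything else is assumed.
class Proxy:
    def __init__(self, host, port):
        self.host = host
        self.port = int(port)

    def __str__(self):
        return f'{self.host}:{self.port}'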

proxypool/crawlers/ip3366.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.proxy import Proxy
import re


MAX_PAGE = 5
BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'

class IP3366Crawler(BaseCrawler):
    """
    ip3366 crawler, http://www.ip3366.net/
    """
    urls = [BASE_URL.format(page=i) for i in range(1, 8)]


    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        # \s* matches the whitespace (including newlines) between the cells
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            proxy = Proxy(host=address.strip(), port=int(port.strip()))
            yield proxy

if __name__ == '__main__':
    crawler = IP3366Crawler()
    for proxy in crawler.crawl():
        print(proxy)
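The ip3366 parser relies on a single regular expression to pull the first two cells out of each table row; the \s* between the tags is what lets it skip the newlines and indentation separating the cells. A quick self-contained check against a made-up table fragment:

# Standalone check of the ip3366 row regex; the HTML fragment is made up.
import re

sample = '''<tr>
  <td>1.2.3.4</td>
  <td>8080</td>
</tr>'''

pattern = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
print(pattern.findall(sample))   # [('1.2.3.4', '8080')]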

proxypool/crawlers/iphai.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.proxy import Proxy
import re


BASE_URL = 'http://www.iphai.com/'

class IPHaiCrawler(BaseCrawler):
    """
    iphai crawler, http://www.iphai.com/
    """
    urls = [BASE_URL]


    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
            find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                proxy = Proxy(host=address.strip(), port=int(port.strip()))
                yield proxy

if __name__ == '__main__':
    crawler = IPHaiCrawler()
    for proxy in crawler.crawl():
        print(proxy)
